1#
2# -*- Perl -*-
3# $Id: wakati.pl,v 1.9.8.10 2009-01-28 17:54:57 opengl2772 Exp $
4# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5# Copyright (C) 2000-2009 Namazu Project All rights reserved.
6#     This is free software with ABSOLUTELY NO WARRANTY.
7#
8#  This program is free software; you can redistribute it and/or modify
9#  it under the terms of the GNU General Public License as published by
10#  the Free Software Foundation; either versions 2, or (at your option)
11#  any later version.
12#
13#  This program is distributed in the hope that it will be useful
14#  but WITHOUT ANY WARRANTY; without even the implied warranty of
15#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16#  GNU General Public License for more details.
17#
18#  You should have received a copy of the GNU General Public License
19#  along with this program; if not, write to the Free Software
20#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21#  02111-1307, USA
22#
23#  This file must be encoded in EUC-JP encoding
24#
25
26package wakati;
27use strict;
28
29# Do wakatigaki processing for a Japanese text.
30sub wakatize_japanese ($) {
31    my ($content) = @_;
32
33    my @tmp = wakatize_japanese_sub($content);
34
35    # Remove words consists of only Hiragana characters
36    # when -H option is specified.
37    # Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
38    # [1997-11-13]
39    # And do Okurigana processing. [1998-04-24]
40    if ($var::Opt{'hiragana'} || $var::Opt{'okurigana'}) {
41        for (my $ndx = 0; $ndx <= $#tmp; ++$ndx) {
42	    $tmp[$ndx] =~ s/(\s)/ $1/g;
43	    $tmp[$ndx] = ' ' . $tmp[$ndx];
44	    if ($var::Opt{'okurigana'}) {
45		$tmp[$ndx] =~ s/((?:[^\xa4][\xa1-\xfe])+)(?:\xa4[\xa1-\xf3])+ /$1 /g;
46	    }
47	    if ($var::Opt{'hiragana'}) {
48		$tmp[$ndx] =~ s/ (?:\xa4[\xa1-\xf3])+ //g;
49	    }
50        }
51    }
52
53    # Collect only noun words when -m option is specified.
54    if ($var::Opt{'noun'}) {
55	$$content = "";
56	$$content .= shift(@tmp) =~ /(.+ )\xcc\xbe\xbb\xec/ ? $1 : "" while @tmp;
57	# noun (meisi) in Japanese "cc be bb ec"
58    } else {
59	$$content = join("\n", @tmp);
60    }
61    $$content =~ s/^\s+//gm;
62    $$content =~ s/\s+$//gm;
63    $$content =~ s/ +/ /gm;
64    $$content .= "\n";
65    util::dprint(_("-- wakatized content --\n")."$$content\n");
66}
67
68sub wakatize_japanese_sub ($) {
69    my ($content) = @_;
70    my $str = "";
71    my @tmp = ();
72
73    if ($conf::WAKATI =~ /^module_(\w+)/) {
74	my $module = $1;
75	if ($module eq "kakasi") {
76	    $str = $$content;
77	    $str =~ s/([\x80-\xff]+)/{my $text = Text::Kakasi::do_kakasi($1); " $text ";}/ge;
78	} elsif ($module eq "chasen") {
79            if ($var::Opt{'noun'}) {
80	        $str = Text::ChaSen::sparse_tostr_long($$content);
81            } else {
82                $str = $$content;
83	        $str =~ s/([\x80-\xff]+)/{my $text = Text::ChaSen::sparse_tostr_long($1); " $text ";}/ge;
84            }
85	} elsif ($module eq "mecab") {
86	    use vars qw($t);
87	    if (!defined $t) {
88		require MeCab;
89		import MeCab;
90                eval '$t = new MeCab::Tagger("-Owakati");' or
91                    $t = new MeCab::Tagger([qw(mecab -O wakati)]);
92	    }
93	    END {
94		$t->DESTROY() if defined $t;
95	    };
96            $str = $$content;
97            $str =~ s/([\x80-\xff]+)/{my $s = $1; my $text = $t->parse($s); " $text ";}/ge;
98	} else {
99	    util::cdie(_("invalid wakati module: ")."$module\n");
100	}
101        util::dprint(_("-- wakatized bare content --\n")."$str\n\n");
102	@tmp = split('\n', $str);
103    } else {
104	my $tmpfile = util::tmpnam("NMZ.wakati");
105        util::dprint(_("wakati: using ")."$conf::WAKATI\n");
106	# Don't use IPC::Open2 because it's not efficent.
107        if ($var::Opt{'noun'}) {
108            my $fh_wakati = util::efopen("|$conf::WAKATI > $tmpfile");
109            print $fh_wakati $$content;
110            util::fclose($fh_wakati);
111        } else {
112            $str = $$content;
113
114            my $redirect = ">";
115            while(1) {
116                if ($str =~ s/^([\x80-\xff]+)//s) {
117                    my $fh_wakati = util::efopen("|$conf::WAKATI $redirect $tmpfile");
118                    print $fh_wakati " $1\n";
119                    util::fclose($fh_wakati);
120                } elsif ($str =~ s/^([\x00-\x7f]+)//s) {
121                    my $fh_wakati = util::efopen("$redirect $tmpfile");
122                    print $fh_wakati " $1 ";
123                    util::fclose($fh_wakati);
124                } else {
125                    last;
126                }
127
128                $redirect = ">>";
129            }
130	}
131	{
132	    my $fh_wakati = util::efopen($tmpfile);
133	    @tmp = <$fh_wakati>;
134	    chomp @tmp;
135            util::fclose($fh_wakati);
136	}
137	unlink $tmpfile;
138    }
139
140    return @tmp;
141}
142
1431;
144