1=comment
2  YamCha -- Yet Another Multipurpose CHunk Annotator
3
4  $Id: PKI.pm,v 1.2 2004/09/20 09:59:16 taku-ku Exp $;
5
6  Copyright (C) 2000-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
7  This is free software with ABSOLUTELY NO WARRANTY.
8
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Lesser General Public
11  License as published by the Free Software Foundation; either
12  version 2.1 of the License, or (at your option) any later version.
13
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  Lesser General Public License for more details.
18
19  You should have received a copy of the GNU Lesser General Public
20  License along with this library; if not, write to the Free Software
21  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22=cut
23
24package PKI;
25use File::Spec;
26use strict;
27use vars qw (@sv %fi2si %example
28             $svidx $svsize $dsize $ndsize $dasize $fsize $tsize
29             $kernel_type $degree $param_g $param_s $param_r
30             $outfile $tooldir $mkdarts);
31
32sub initialize
33{
34    ($outfile, $tooldir) = @_;
35
36    die "FATAL: empty file name\n" if (! defined $outfile || $outfile eq "");
37
38    $mkdarts = File::Spec->catfile ($tooldir, "mkdarts");
39
40    @sv      = ();
41    %fi2si   = ();
42    %example = ();
43    $dsize   = 0;
44    $ndsize  = 0;
45    $svidx   = 0;
46}
47
48sub get_type   { return 1; } # PKB:0 PKI:1 PKE:2
49
50sub get_header
51{
52    # INT x 8
53    return ($dsize,      # dimension of model,
54	    $ndsize,     # max non dimension, which is used cache of dot products.
55	    $dasize,     # size of double array
56	    $svsize,     # size of support vectors;
57	    $tsize,      # table size
58	    $fsize,      # feature size
59	    0, 0);    # dummy (reserved area)
60}
61
62sub finalize
63{
64    return if (! defined $outfile || $outfile eq "");
65    for (get_concat_files()) {
66	unlink $_ if (-f $_);
67    }
68}
69
70sub set_kernel_param
71{
72    ($kernel_type,$degree, $param_g, $param_s, $param_r) = @_;
73}
74
75sub process_line
76{
77    my ($line, $m) = @_;
78
79    my ($alpha, $ex) = split /\s+/, $line, 2;
80    my @tmp = split /\s+/, $ex;
81
82    if (! defined $example{$ex}) {
83	for (@tmp) {
84	    my ($i ,$v) = split /:/, $_;
85	    $dsize = $dsize > $i ? $dsize : $i;
86	    push @{$fi2si{$i}}, $svidx;
87	}
88	$ndsize = $ndsize > ($#tmp+1) ? $ndsize: ($#tmp+1);
89	$example{$ex} = $svidx;
90	$sv[$svidx]->{$m} += $alpha;
91	++$svidx;
92    } else {
93	my $idx = $example{$ex};
94	$sv[$idx]->{$m} += $alpha;
95    }
96}
97
98sub mkmodel
99{
100    my @dic   = @{$_[0]}; # dic
101    my @model = @{$_[1]}; # model param
102
103    my $msize = scalar (@model);
104
105    ++$svidx;
106    $svsize = $svidx;
107
108    ++$dsize;
109
110    %example = ();
111    undef %example;
112
113    $tsize = 0;
114    my %old2new = ();
115    open (S, "> $outfile.t")  || die "$!: $outfile.t\n"; binmode S;
116    for (my $i = 0; $i < $dsize; ++$i) {
117	print "."  if ($i % 1000 == 0);
118	if (defined $fi2si{$i}) {
119	    $old2new{$i} = $tsize;
120	    $tsize += (scalar (@{$fi2si{$i}}) + 1);
121	    print S pack ("i*", @{$fi2si{$i}});
122	    print S pack ("i", -1);
123	} else {
124	    $old2new{$i} = -1;
125	}
126    }
127    close (S);
128
129    $fsize = 0;
130    open (S1, "> $outfile.idx")   || die "$!: $outfile.idx\n"; binmode S1;
131    open (S2, "> $outfile.alpha")   || die "$!: $outfile.alpha\n"; binmode S2;
132    for (my $m = 0; $m < $msize; ++$m) {
133	for my $idx (0 .. $#sv) {
134	    print "."  if ($idx % 10000 == 0);
135	    if (defined $sv[$idx]->{$m}) {
136		print S1 pack("i", $idx);
137		print S2 pack("d", $sv[$idx]->{$m});
138		++$fsize;
139	    }
140	}
141	print S1 pack("i", -1);
142	print S2 pack("d", 0.0);
143	++$fsize;
144    }
145    close (S1);
146    close (S2);
147
148    print "\n";
149
150    open (S, "| $mkdarts - $outfile.da") || die "$!: $mkdarts\n";
151    for (@dic) {
152	my ($i, $str) = @{$_};
153	my $n = $old2new{$i};
154	print S "$n $str\n" if ($n != -1);
155    }
156    close (S);
157    $dasize = (stat("$outfile.da"))[7];
158
159    # aligned 8
160    my $n = $dasize + $tsize + $fsize;
161    if ($n % 2 == 1) {
162        open (S1, ">> $outfile.idx")    || die "$!: $outfile.idx\n"; binmode S1;
163	open (S2, ">> $outfile.alpha")  || die "$!: $outfile.alpha\n"; binmode S2;
164	print S1 pack("i", -1);
165	print S2 pack("d", 0.0);
166	close (S1);
167	close (S2);
168	++$fsize;
169    }
170}
171
172sub get_concat_files
173{
174    return ("$outfile.da", "$outfile.t", "$outfile.idx", "$outfile.alpha");
175}
176
1771;
178