1=comment 2 YamCha -- Yet Another Multipurpose CHunk Annotator 3 4 $Id: PKI.pm,v 1.2 2004/09/20 09:59:16 taku-ku Exp $; 5 6 Copyright (C) 2000-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp> 7 This is free software with ABSOLUTELY NO WARRANTY. 8 9 This library is free software; you can redistribute it and/or 10 modify it under the terms of the GNU Lesser General Public 11 License as published by the Free Software Foundation; either 12 version 2.1 of the License, or (at your option) any later version. 13 14 This library is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 Lesser General Public License for more details. 18 19 You should have received a copy of the GNU Lesser General Public 20 License along with this library; if not, write to the Free Software 21 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22=cut 23 24package PKI; 25use File::Spec; 26use strict; 27use vars qw (@sv %fi2si %example 28 $svidx $svsize $dsize $ndsize $dasize $fsize $tsize 29 $kernel_type $degree $param_g $param_s $param_r 30 $outfile $tooldir $mkdarts); 31 32sub initialize 33{ 34 ($outfile, $tooldir) = @_; 35 36 die "FATAL: empty file name\n" if (! defined $outfile || $outfile eq ""); 37 38 $mkdarts = File::Spec->catfile ($tooldir, "mkdarts"); 39 40 @sv = (); 41 %fi2si = (); 42 %example = (); 43 $dsize = 0; 44 $ndsize = 0; 45 $svidx = 0; 46} 47 48sub get_type { return 1; } # PKB:0 PKI:1 PKE:2 49 50sub get_header 51{ 52 # INT x 8 53 return ($dsize, # dimension of model, 54 $ndsize, # max non dimension, which is used cache of dot products. 55 $dasize, # size of double array 56 $svsize, # size of support vectors; 57 $tsize, # table size 58 $fsize, # feature size 59 0, 0); # dummy (reserved area) 60} 61 62sub finalize 63{ 64 return if (! defined $outfile || $outfile eq ""); 65 for (get_concat_files()) { 66 unlink $_ if (-f $_); 67 } 68} 69 70sub set_kernel_param 71{ 72 ($kernel_type,$degree, $param_g, $param_s, $param_r) = @_; 73} 74 75sub process_line 76{ 77 my ($line, $m) = @_; 78 79 my ($alpha, $ex) = split /\s+/, $line, 2; 80 my @tmp = split /\s+/, $ex; 81 82 if (! defined $example{$ex}) { 83 for (@tmp) { 84 my ($i ,$v) = split /:/, $_; 85 $dsize = $dsize > $i ? $dsize : $i; 86 push @{$fi2si{$i}}, $svidx; 87 } 88 $ndsize = $ndsize > ($#tmp+1) ? $ndsize: ($#tmp+1); 89 $example{$ex} = $svidx; 90 $sv[$svidx]->{$m} += $alpha; 91 ++$svidx; 92 } else { 93 my $idx = $example{$ex}; 94 $sv[$idx]->{$m} += $alpha; 95 } 96} 97 98sub mkmodel 99{ 100 my @dic = @{$_[0]}; # dic 101 my @model = @{$_[1]}; # model param 102 103 my $msize = scalar (@model); 104 105 ++$svidx; 106 $svsize = $svidx; 107 108 ++$dsize; 109 110 %example = (); 111 undef %example; 112 113 $tsize = 0; 114 my %old2new = (); 115 open (S, "> $outfile.t") || die "$!: $outfile.t\n"; binmode S; 116 for (my $i = 0; $i < $dsize; ++$i) { 117 print "." if ($i % 1000 == 0); 118 if (defined $fi2si{$i}) { 119 $old2new{$i} = $tsize; 120 $tsize += (scalar (@{$fi2si{$i}}) + 1); 121 print S pack ("i*", @{$fi2si{$i}}); 122 print S pack ("i", -1); 123 } else { 124 $old2new{$i} = -1; 125 } 126 } 127 close (S); 128 129 $fsize = 0; 130 open (S1, "> $outfile.idx") || die "$!: $outfile.idx\n"; binmode S1; 131 open (S2, "> $outfile.alpha") || die "$!: $outfile.alpha\n"; binmode S2; 132 for (my $m = 0; $m < $msize; ++$m) { 133 for my $idx (0 .. $#sv) { 134 print "." if ($idx % 10000 == 0); 135 if (defined $sv[$idx]->{$m}) { 136 print S1 pack("i", $idx); 137 print S2 pack("d", $sv[$idx]->{$m}); 138 ++$fsize; 139 } 140 } 141 print S1 pack("i", -1); 142 print S2 pack("d", 0.0); 143 ++$fsize; 144 } 145 close (S1); 146 close (S2); 147 148 print "\n"; 149 150 open (S, "| $mkdarts - $outfile.da") || die "$!: $mkdarts\n"; 151 for (@dic) { 152 my ($i, $str) = @{$_}; 153 my $n = $old2new{$i}; 154 print S "$n $str\n" if ($n != -1); 155 } 156 close (S); 157 $dasize = (stat("$outfile.da"))[7]; 158 159 # aligned 8 160 my $n = $dasize + $tsize + $fsize; 161 if ($n % 2 == 1) { 162 open (S1, ">> $outfile.idx") || die "$!: $outfile.idx\n"; binmode S1; 163 open (S2, ">> $outfile.alpha") || die "$!: $outfile.alpha\n"; binmode S2; 164 print S1 pack("i", -1); 165 print S2 pack("d", 0.0); 166 close (S1); 167 close (S2); 168 ++$fsize; 169 } 170} 171 172sub get_concat_files 173{ 174 return ("$outfile.da", "$outfile.t", "$outfile.idx", "$outfile.alpha"); 175} 176 1771; 178