#!/usr/bin/perl -w

# huge-combine3.pl : Combines trigram files.

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=head1 NAME

huge-combine3.pl - Combine two trigram files created by count.pl into single file

=head1 SYNOPSIS

Combines two trigram files created by count.pl into a single trigram file.

=head1 USAGE

huge-combine3.pl [OPTIONS] COUNT1 COUNT2

=head1 INPUT

=head2 Required Arguments:

=head3 COUNT1 and COUNT2

huge-combine3.pl takes two trigram files created by count.pl as input.
If COUNT1 and COUNT2 are of unequal sizes, it is strongly recommended
that COUNT1 should be the smaller file and COUNT2 should be the larger
trigram file, because COUNT1 is loaded into memory as a whole while COUNT2
is processed one line at a time.

Each line in files COUNT1, COUNT2 should be formatted as -

word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11

where word1<>word2<>word3 is a trigram, n111 is the joint frequency score of
this trigram, n1pp is the number of trigrams in which word1 is the first word,
np1p is the number of trigrams having word2 as the second word, and npp1 is
the number of trigrams having word3 as the third word. n11p, n1p1 and np11
are the number of trigrams having word1 and word2 in the first two positions,
word1 and word3 in the first and third positions, and word2 and word3 in the
last two positions, respectively.

A line that holds only a number is taken as the total number of trigrams in
that file.

=head2 Optional Arguments:

=head4 --help

Displays this message.

=head4 --version

Displays the version information.

=head1 OUTPUT

Output displays all trigrams that appear in COUNT1 or in COUNT2 (or in both)
along with their updated scores. The sum of the two trigram totals is printed
when the total line of COUNT2 is read. Trigrams found in COUNT2 are printed in
the order in which they appear in COUNT2; trigrams found only in COUNT1 are
printed after them in sorted order. Scores are updated such that -

=over

=item 1:

If a trigram appears in both COUNT1 and COUNT2, their n111 scores are added.

e.g. If COUNT1 contains a trigram
	word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and COUNT2 has a trigram
	word1<>word2<>word3<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the new n111 score of trigram word1<>word2<>word3 is n111+m111

=item 2:

If the two trigrams belonging to COUNT1 and COUNT2 share a common first word,
their n1pp scores are added.

e.g. If COUNT1 contains a trigram
	word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and if COUNT2 contains a trigram
	word1<>word4<>word5<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the n1pp marginal score of word1 is updated to n1pp+m1pp

=item 3:

If the two trigrams belonging to COUNT1 and COUNT2 share a common second word,
their np1p scores are added.

e.g. If COUNT1 contains a trigram
	word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and if COUNT2 contains a trigram
	word4<>word2<>word5<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the np1p marginal score of word2 is updated to np1p+mp1p

=back

The remaining marginal scores (npp1, n11p, n1p1 and np11) are combined in the
same manner whenever the corresponding word or word pair occurs in both files.

=head1 AUTHOR

Amruta Purandare, Ted Pedersen.
University of Minnesota at Duluth.

=head1 COPYRIGHT

Copyright (c) 2004, 2009

Amruta Purandare, University of Minnesota, Duluth.
pura0010@umn.edu

Ted Pedersen, University of Minnesota, Duluth.
tpederse@umn.edu

Cyrus Shaoul, University of Alberta, Edmonton
cyrus.shaoul@ualberta.ca

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to

The Free Software Foundation, Inc.,
59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.

=cut

###############################################################################

#                       ===============================
#                               CODE STARTS HERE
#                       ===============================

use strict;
use warnings;
use Getopt::Long;

# A line that holds nothing but the total number of trigrams in a file.
my $TOTAL_LINE = qr/^\s*(\d+)\s*$/;

# A trigram record:
#   word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
my $TRIGRAM_LINE =
    qr/^\s*(.*)<>(.*)<>(.*)<>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*$/;

# The six marginal scores, in the order in which they follow n111 on a
# record. Each entry is [ score name, positions (0-based) of the trigram
# words that the score belongs to ].
my @MARGINALS = (
    [ 'n1pp', [0]    ],
    [ 'np1p', [1]    ],
    [ 'npp1', [2]    ],
    [ 'n11p', [0, 1] ],
    [ 'n1p1', [0, 2] ],
    [ 'np11', [1, 2] ],
);

# Run as a program unless this file was loaded by another script
# (e.g. by a test), in which case only the subroutines are defined.
exit(main(@ARGV)) unless caller;

###############################################################################

#                       ================================
#                        COMMAND LINE OPTIONS AND USAGE
#                       ================================

#-----------------------------------------------------------------------------
# Program entry point.
#   Arguments : the command line words ([OPTIONS] COUNT1 COUNT2)
#   Returns   : 0, the exit status of a successful run
#   Dies      : with an "ERROR(program): ..." message when an input file is
#               missing, unreadable or internally inconsistent
sub main
{
    # Getopt::Long works on @ARGV, so expose the arguments there
    local @ARGV = @_;

    # $0 contains the program name along with the complete path.
    # Extract just the program name and use it in error messages.
    $0 =~ s/.*\/(.+)/$1/;

    # command line options; as before, unknown options are only reported
    # by Getopt::Long and do not stop the program
    my ($opt_help, $opt_version);
    GetOptions("help" => \$opt_help, "version" => \$opt_version);

    # show help option
    if ($opt_help)
    {
        showhelp();
        return 0;
    }

    # show version information
    if ($opt_version)
    {
        showversion();
        return 0;
    }

    # show minimal usage message if fewer arguments
    if (@ARGV < 2)
    {
        showminimal();
        return 0;
    }

    my ($small_file, $big_file) = @ARGV;

    if (!-e $small_file)
    {
        die "ERROR($0):\n\tCOUNT1 file <$small_file> doesn't exist.\n";
    }
    if (!-e $big_file)
    {
        die "ERROR($0):\n\tCOUNT2 file <$big_file> doesn't exist.\n";
    }

    # three-argument open: the file names are never interpreted as modes
    open(my $small_fh, '<', $small_file)
        or die "ERROR($0):\n\tError(code=$!) in opening COUNT1 file <$small_file>.\n";
    open(my $big_fh, '<', $big_file)
        or die "ERROR($0):\n\tError(code=$!) in opening COUNT2 file <$big_file>.\n";

    # loading trigrams from smaller file into memory
    my $counts = load_count1($small_fh, $small_file);
    close($small_fh);

    # reading bigger file, printing its trigrams as they are read
    merge_count2($big_fh, $counts);
    close($big_fh);

    # printing trigrams appearing only in COUNT1
    print_count1_only($counts);

    return 0;
}

###############################################################################

#                       ==========================
#                          SUBROUTINE SECTION
#                       ==========================

#-----------------------------------------------------------------------------
# Splits one trigram record into its fields.
#   Returns (word1, word2, word3, n111, n1pp, np1p, npp1, n11p, n1p1, np11),
#   or an empty list when the line is not a trigram record.
sub parse_trigram_line
{
    my ($line) = @_;
    return $line =~ $TRIGRAM_LINE;
}

#-----------------------------------------------------------------------------
# Builds the hash key of a marginal score from the trigram words found at
# the given positions. Words are joined with $; (the separator Perl uses for
# multi-dimensional hash keys), which cannot be confused with "<>".
sub marginal_key
{
    my ($positions, @words) = @_;
    return join($;, @words[@$positions]);
}

#-----------------------------------------------------------------------------
# Looks up a trigram in the nested n111 table WITHOUT creating intermediate
# hashes for trigrams that are absent.
#   Returns the innermost hash (keyed by word3) when the trigram is stored,
#   nothing otherwise.
sub find_trigram
{
    my ($n111, $word1, $word2, $word3) = @_;

    return unless exists $n111->{$word1} && exists $n111->{$word1}{$word2};

    my $leaf = $n111->{$word1}{$word2};
    return unless defined $leaf->{$word3};
    return $leaf;
}

#-----------------------------------------------------------------------------
# Loads the complete COUNT1 file into memory.
#   Arguments : open read handle, file name (for error messages)
#   Returns   : reference to a hash with the keys
#                 total   - trigram total of COUNT1 (undef if no total line)
#                 n111    - {word1}{word2}{word3} => joint frequency
#                 n1pp .. np11 - marginal_key(...) => marginal score
#                 matched - filled later with the trigrams also seen in COUNT2
#   Dies      : when a trigram is repeated or a marginal score is not the
#               same on every line it appears on
sub load_count1
{
    my ($fh, $file) = @_;

    my %counts = (total => undef, n111 => {}, matched => {});
    $counts{ $_->[0] } = {} for @MARGINALS;

    while (my $line = <$fh>)
    {
        # total trigrams
        if ($line =~ $TOTAL_LINE)
        {
            $counts{total} = $1;
            next;
        }

        my @fields = parse_trigram_line($line);
        next unless @fields;
        my ($word1, $word2, $word3, $joint, @marginal) = @fields;
        my @words = ($word1, $word2, $word3);

        if (find_trigram($counts{n111}, @words))
        {
            die "ERROR($0):\n\tTrigram <$word1<>$word2<>$word3> is repeated in COUNT1 file <$file>.\n";
        }
        $counts{n111}{$word1}{$word2}{$word3} = $joint;

        # a marginal belongs to a word (or word pair), so every record that
        # mentions it must carry the same value
        for my $i (0 .. $#MARGINALS)
        {
            my ($name, $positions) = @{ $MARGINALS[$i] };
            my $table = $counts{$name};
            my $key   = marginal_key($positions, @words);

            if (defined $table->{$key} && $table->{$key} != $marginal[$i])
            {
                # report the word(s) this score really belongs to
                my $what  = @$positions > 1 ? 'Word pair' : 'Word';
                my $label = join('<>', @words[@$positions]);
                die "ERROR($0):\n\t$what <$label> has two different $name scores in COUNT1 file <$file>.\n";
            }
            $table->{$key} = $marginal[$i];
        }
    }

    return \%counts;
}

#-----------------------------------------------------------------------------
# Streams through COUNT2, combines every record with the COUNT1 scores held
# in memory and prints the combined record at once. Only trigrams and
# marginals that exist in COUNT1 are touched in memory, so memory use does
# not grow with the size of COUNT2.
#   Arguments : open read handle of COUNT2, structure from load_count1()
sub merge_count2
{
    my ($fh, $counts) = @_;

    # COUNT1 marginals that already had the COUNT2 value added:
    # {score name}{marginal key} => 1
    my %merged = map { $_->[0] => {} } @MARGINALS;

    while (my $line = <$fh>)
    {
        # total trigrams
        if ($line =~ $TOTAL_LINE)
        {
            # COUNT1 may lack a total line; count it as 0 then
            my $total1 = defined $counts->{total} ? $counts->{total} : 0;
            my $total  = $total1 + $1;
            print "$total\n";
            next;
        }

        my @fields = parse_trigram_line($line);
        next unless @fields;
        my ($word1, $word2, $word3, $joint, @marginal) = @fields;
        my @words = ($word1, $word2, $word3);

        my @scores;
        if (my $leaf = find_trigram($counts->{n111}, @words))
        {
            $leaf->{$word3} += $joint;
            # mark the trigrams that appear in both files
            $counts->{matched}{ marginal_key([0, 1, 2], @words) } = 1;
            # get the updated n111 score
            push @scores, $leaf->{$word3};
        }
        else
        {
            # trigram appearing only in COUNT2
            push @scores, $joint;
        }

        for my $i (0 .. $#MARGINALS)
        {
            my ($name, $positions) = @{ $MARGINALS[$i] };
            my $table = $counts->{$name};
            my $key   = marginal_key($positions, @words);

            if (defined $table->{$key})
            {
                # update each marginal only once, however many COUNT2
                # records mention it
                if (!$merged{$name}{$key})
                {
                    $table->{$key} += $marginal[$i];
                    $merged{$name}{$key} = 1;
                }
                push @scores, $table->{$key};
            }
            else
            {
                # marginal appearing only in COUNT2
                push @scores, $marginal[$i];
            }
        }

        # printing trigrams from COUNT2
        print join('<>', @words, join(' ', @scores)), "\n";
    }
    return;
}

#-----------------------------------------------------------------------------
# Prints, in sorted order, the COUNT1 trigrams that were not found in COUNT2.
# Their marginals already include whatever COUNT2 contributed.
#   Arguments : structure from load_count1(), after merge_count2() has run
sub print_count1_only
{
    my ($counts) = @_;
    my $n111 = $counts->{n111};

    foreach my $word1 (sort keys %$n111)
    {
        foreach my $word2 (sort keys %{ $n111->{$word1} })
        {
            foreach my $word3 (sort keys %{ $n111->{$word1}{$word2} })
            {
                my @words = ($word1, $word2, $word3);

                # avoiding trigrams that appear in COUNT2
                next if $counts->{matched}{ marginal_key([0, 1, 2], @words) };

                my @scores = ($n111->{$word1}{$word2}{$word3});
                foreach my $marginal (@MARGINALS)
                {
                    my ($name, $positions) = @$marginal;
                    push @scores,
                        $counts->{$name}{ marginal_key($positions, @words) };
                }
                print join('<>', @words, join(' ', @scores)), "\n";
            }
        }
    }
    return;
}

#-----------------------------------------------------------------------------
#show minimal usage message
sub showminimal
{
    print "Usage: huge-combine3.pl [OPTIONS] COUNT1 COUNT2";
    print "\nTYPE huge-combine3.pl --help for help\n";
    return;
}

#-----------------------------------------------------------------------------
#show help
sub showhelp
{
    print "Usage:  huge-combine3.pl [OPTIONS] COUNT1 COUNT2\n",
          "\n",
          "Combines two trigram files COUNT1 and COUNT2.\n",
          "\n",
          "COUNT1 COUNT2\n",
          "\tTrigram files created by count.pl.\n",
          "\n",
          "OPTIONS:\n",
          "--help\n",
          "\tDisplays this message.\n",
          "--version\n",
          "\tDisplays the version information.\n",
          "Type 'perldoc huge-combine3.pl' to view detailed documentation of\n",
          "huge-combine3.\n";
    return;
}

#------------------------------------------------------------------------------
#version information
sub showversion
{
    print "huge-combine3.pl      -       Version 0.01\n";
    print "Combines the given two trigram files.\n";
    print "Copyright (C) 2004, Amruta Purandare & Ted Pedersen.\n";
    print "Date of Last Update:     03/03/2004\n";
    return;
}

#############################################################################

1;