1#!/usr/bin/perl -w
2
3# huge-combine3.pl : Combines trigram files.
4
# Shell fallback: a shell that is handed this file runs the eval/exec below
# and thereby restarts the script under /usr/bin/perl.  Perl itself never
# executes the exec because the statement modifier "if 0" is always false.
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
7
8=head1 NAME
9
10huge-combine3.pl - Combine two trigram files created by count.pl into single file
11
12=head1 SYNOPSIS
13
14Combines two trigram files created by count.pl into a single trigram file.
15
=head1 USAGE
17
18huge-combine3.pl [OPTIONS] COUNT1 COUNT2
19
20=head1 INPUT
21
22=head2 Required Arguments:
23
24=head3 COUNT1 and COUNT2
25
huge-combine3.pl takes two trigram files created by count.pl as input.
If COUNT1 and COUNT2 are of unequal sizes, it is strongly recommended
that COUNT1 should be the smaller file and COUNT2 should be the larger
trigram file, because COUNT1 is held entirely in memory while COUNT2 is
read line by line.
30
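Each line in files COUNT1, COUNT2 should be formatted as -

word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11

where word1<>word2<>word3 is a trigram, n111 is the joint frequency score of
this trigram, n1pp, np1p and npp1 are the numbers of trigrams in which word1
is the first word, word2 is the second word and word3 is the third word
respectively, while n11p, n1p1 and np11 are the numbers of trigrams that
contain the word pairs word1<>word2, word1<>word3 and word2<>word3 in those
same positions.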
38
39=head2 Optional Arguments:
40
41=head4 --help
42
43Displays this message.
44
45=head4 --version
46
47Displays the version information.
48
49=head1 OUTPUT
50
51Output displays all trigrams that appear either in COUNT1 (inclusive) or
52in COUNT2 along with their updated scores. Scores are updated such that -
53
54=over
55
56=item 1:
57
If a trigram appears in both COUNT1 and COUNT2, their n111 scores are added.

e.g. If COUNT1 contains a trigram
	word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and COUNT2 has a trigram
	word1<>word2<>word3<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the new n111 score of trigram word1<>word2<>word3 is n111+m111
66
67=item 2:
68
If the two trigrams belonging to COUNT1 and COUNT2 share a common first word,
their n1pp scores are added.

e.g. If COUNT1 contains a trigram
	word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and if COUNT2 contains a trigram
	word1<>word4<>word5<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the n1pp marginal score of word1 is updated to n1pp+m1pp
78
79=item 3:
80
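If the two trigrams belonging to COUNT1 and COUNT2 share a common second word,
their np1p scores are added.

e.g. If COUNT1 contains a trigram
        word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
and if COUNT2 contains a trigram
        word4<>word2<>word5<>m111 m1pp mp1p mpp1 m11p m1p1 mp11

Then, the np1p marginal score of word2 is updated to np1p+mp1p

The remaining marginals (npp1, n11p, n1p1 and np11) are combined in the same
way whenever the corresponding word or word pair occurs in both files.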
90
91=back
92
93=head1 AUTHOR
94
95Amruta Purandare, Ted Pedersen.
96University of Minnesota at Duluth.
97
98=head1 COPYRIGHT
99
100Copyright (c) 2004, 2009
101
102Amruta Purandare, University of Minnesota, Duluth.
103pura0010@umn.edu
104
105Ted Pedersen, University of Minnesota, Duluth.
106tpederse@umn.edu
107
108Cyrus Shaoul, University of Alberta, Edmonton
109cyrus.shaoul@ualberta.ca
110
111This program is free software; you can redistribute it and/or modify it under
112the terms of the GNU General Public License as published by the Free Software
113Foundation; either version 2 of the License, or (at your option) any later
114version.
115
116This program is distributed in the hope that it will be useful, but WITHOUT
117ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
118FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
119
120You should have received a copy of the GNU General Public License along with
121this program; if not, write to
122
123The Free Software Foundation, Inc.,
12459 Temple Place - Suite 330,
125Boston, MA  02111-1307, USA.
126
127=cut
128
129###############################################################################
130
131#			===============================
132#                               CODE STARTS HERE
133#			===============================
134
# $0 arrives with the full path of the script.  Keep only the part after
# the last "/" so that error messages show just the program name.
$0 =~ s{.*/(.+)}{$1};
139
140###############################################################################
141
142#                           ================================
143#                            COMMAND LINE OPTIONS AND USAGE
144#                           ================================
145
# Command line handling.  Called without destinations, GetOptions records
# --help and --version in the package variables $opt_help and $opt_version.
use Getopt::Long;
GetOptions("help", "version");

# Every branch below ends the program, so at most one of them ever runs.
# The helpers are invoked with a leading "&" because they are declared with
# an empty prototype further down the file (presumably to avoid the
# "called too early to check prototype" warning under -w).
if (defined $opt_help) {
    # --help: print the long help text
    $opt_help = 1;    # extra mention, presumably to silence "used only once"
    &showhelp();
    exit;
}
elsif (defined $opt_version) {
    # --version: print version information
    $opt_version = 1;
    &showversion();
    exit;
}
elsif (@ARGV < 2) {
    # both COUNT1 and COUNT2 are required; otherwise print the short usage
    &showminimal();
    exit;
}
171
172#############################################################################
173
174#                       ================================
175#                          INITIALIZATION AND INPUT
176#                       ================================
177
# COUNT1 is the file that gets loaded into memory, COUNT2 the one that is
# streamed; both names come straight from the command line.
$small_file=$ARGV[0];
$big_file=$ARGV[1];

# A missing input file is an error: report it on STDERR and leave with a
# non-zero exit status so that calling scripts can detect the failure.
if(!-e $small_file)
{
	print STDERR "ERROR($0):\n\tCOUNT1 file <$small_file> doesn't exist.\n";
	exit 1;
}

if(!-e $big_file)
{
	print STDERR "ERROR($0):\n        COUNT2 file <$big_file> doesn't exist.\n";
	exit 1;
}

# Three-argument open: the file name is always treated as a plain path to
# read, never as a mode prefix or a pipe command.  The handles SMALL and BIG
# are read by the sections that follow.
open(SMALL,'<',$small_file) or die "ERROR($0):\n\tError(code=$!) in opening COUNT1 file <$small_file>.\n";
open(BIG,'<',$big_file) or die "ERROR($0):\n\tError(code=$!) in opening COUNT2 file <$big_file>.\n";
199
200#############################################################################
201
202#                       ====================
203#                    	    CODE SECTION
204#                       ====================
205
# Load every trigram of the smaller file (COUNT1) into memory.
#
# Fills these package hashes, which the later sections read and update:
#   $n111{w1}{w2}{w3}                             joint count of the trigram
#   $n1pp{w1}, $np1p{w2}, $npp1{w3}               single-position marginals
#   $n11p{w1}{w2}, $n1p1{w1}{w3}, $np11{w2}{w3}   two-position marginals
# and stores the total trigram count of COUNT1 in $total1.
#
# A repeated trigram, or a marginal whose value differs between two lines of
# COUNT1, is reported on STDERR and ends the program with exit status 1.

# Reports an inconsistency in COUNT1 and stops.  The argument is the
# indented middle part of the message; the header and the file name are
# added here so that every message has the same shape.
my $count1_error=sub
{
	print STDERR "ERROR($0):\n$_[0] in COUNT1 file <$small_file>.\n";
	exit 1;
};

while(<SMALL>)
{
	# a line holding a single number is the total number of trigrams
	if(/^\s*(\d+)\s*$/)
	{
		$total1=$1;
		next;
	}

	# trigram line: word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
	# (lines of any other shape are skipped)
	next unless(/^\s*(.*)<>(.*)<>(.*)<>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*$/);

	# copy the captures right away so that no later match can clobber them
	my ($w1,$w2,$w3)=($1,$2,$3);
	my ($c111,$c1pp,$cp1p,$cpp1,$c11p,$c1p1,$cp11)=($4,$5,$6,$7,$8,$9,$10);

	# joint count: each trigram may occur only once per file
	$count1_error->("\tTrigram <$w1<>$w2<>$w3> is repeated")
		if(defined $n111{$w1}{$w2}{$w3});
	$n111{$w1}{$w2}{$w3}=$c111;

	# single-word marginals: a word must carry the same value on every line
	$count1_error->("\tWord <$w1> has two different n1pp scores")
		if(defined $n1pp{$w1} && $n1pp{$w1}!=$c1pp);
	$n1pp{$w1}=$c1pp;

	$count1_error->("        Word <$w2> has two different np1p scores")
		if(defined $np1p{$w2} && $np1p{$w2}!=$cp1p);
	$np1p{$w2}=$cp1p;

	$count1_error->("        Word <$w3> has two different npp1 scores")
		if(defined $npp1{$w3} && $npp1{$w3}!=$cpp1);
	$npp1{$w3}=$cpp1;

	# word-pair marginals: same rule; the message names the pair involved
	$count1_error->("        Word pair <$w1<>$w2> has two different n11p scores")
		if(defined $n11p{$w1}{$w2} && $n11p{$w1}{$w2}!=$c11p);
	$n11p{$w1}{$w2}=$c11p;

	$count1_error->("        Word pair <$w1<>$w3> has two different n1p1 scores")
		if(defined $n1p1{$w1}{$w3} && $n1p1{$w1}{$w3}!=$c1p1);
	$n1p1{$w1}{$w3}=$c1p1;

	$count1_error->("        Word pair <$w2<>$w3> has two different np11 scores")
		if(defined $np11{$w2}{$w3} && $np11{$w2}{$w3}!=$cp11);
	$np11{$w2}{$w3}=$cp11;
}
269
# Stream the bigger file (COUNT2) line by line, merge each line with the
# COUNT1 data held in memory and print the combined trigram immediately.
#
# Lookups into the COUNT1 hashes are guarded with exists() on every
# intermediate level.  A plain multi-level defined() test would create an
# empty inner hash for every word of COUNT2, so memory would grow with the
# big file even though only COUNT1 is meant to be kept in memory.

# Flags remembering which COUNT1 marginals already received their COUNT2
# share; a marginal is repeated on many COUNT2 lines but is added only once.
my (%merged_1pp,%merged_p1p,%merged_pp1,%merged_11p,%merged_1p1,%merged_p11);

# Combines one marginal.
#   $count1 - hash ref with the COUNT1 marginals for this position
#   $merged - hash ref flagging keys whose COUNT2 value was already added
#   $key    - word to look up in both hashes
#   $count2 - marginal value read from the current COUNT2 line
# Returns the combined value when COUNT1 has an entry for $key, otherwise
# $count2 unchanged (the marginal occurs only in COUNT2).
sub _merge_marginal
{
	my ($count1,$merged,$key,$count2)=@_;
	return $count2 if(!defined $count1->{$key});
	if(!defined $merged->{$key})
	{
		$count1->{$key}+=$count2;
		$merged->{$key}=1;
	}
	return $count1->{$key};
}

while(<BIG>)
{
	# a line holding a single number is the total number of trigrams in
	# COUNT2; the combined total is printed as soon as it is seen
	if(/^\s*(\d+)\s*$/)
	{
		$total2=$1;
		$total=$total1+$total2;
		print "$total\n";
		next;
	}

	# trigram line: word1<>word2<>word3<>n111 n1pp np1p npp1 n11p n1p1 np11
	# (lines of any other shape are skipped)
	next unless(/^\s*(.*)<>(.*)<>(.*)<>(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s*$/);

	my ($w1,$w2,$w3)=($1,$2,$3);
	my ($c111,$c1pp,$cp1p,$cpp1,$c11p,$c1p1,$cp11)=($4,$5,$6,$7,$8,$9,$10);

	# joint count: COUNT2 value alone unless the trigram is also in COUNT1
	my $joint=$c111;
	if(exists $n111{$w1} && exists $n111{$w1}{$w2} && defined $n111{$w1}{$w2}{$w3})
	{
		$n111{$w1}{$w2}{$w3}+=$c111;
		# mark the trigram as printed here so that the COUNT1-only pass
		# after this loop skips it
		$update_n111{$w1}{$w2}{$w3}=1;
		$joint=$n111{$w1}{$w2}{$w3};
	}

	# single-word marginals n1pp, np1p, npp1
	my $m1pp=_merge_marginal(\%n1pp,\%merged_1pp,$w1,$c1pp);
	my $mp1p=_merge_marginal(\%np1p,\%merged_p1p,$w2,$cp1p);
	my $mpp1=_merge_marginal(\%npp1,\%merged_pp1,$w3,$cpp1);

	# word-pair marginals n11p, n1p1, np11: descend into the inner hash only
	# when COUNT1 already has the outer key
	my $m11p=exists $n11p{$w1}
		? _merge_marginal($n11p{$w1},($merged_11p{$w1}||={}),$w2,$c11p)
		: $c11p;
	my $m1p1=exists $n1p1{$w1}
		? _merge_marginal($n1p1{$w1},($merged_1p1{$w1}||={}),$w3,$c1p1)
		: $c1p1;
	my $mp11=exists $np11{$w2}
		? _merge_marginal($np11{$w2},($merged_p11{$w2}||={}),$w3,$cp11)
		: $cp11;

	# printing trigrams from COUNT2
	print "$w1<>$w2<>$w3<>$joint $m1pp $mp1p $mpp1 $m11p $m1p1 $mp11\n";
}
400
# Emit the trigrams that occur only in COUNT1, in sorted word order.
# Trigrams flagged in %update_n111 were already printed while COUNT2 was
# being read, so they are skipped here.
for my $first (sort keys %n111)
{
    my $by_second=$n111{$first};
    for my $second (sort keys %{$by_second})
    {
        my $by_third=$by_second->{$second};
        for my $third (sort keys %{$by_third})
        {
            next if(defined $update_n111{$first}{$second}{$third});
            print "$first<>$second<>$third<>$by_third->{$third} $n1pp{$first} $np1p{$second} $npp1{$third} $n11p{$first}{$second} $n1p1{$first}{$third} $np11{$second}{$third}\n";
        }
    }
}
416
417##############################################################################
418
419#                      ==========================
420#                          SUBROUTINE SECTION
421#                      ==========================
422
423#-----------------------------------------------------------------------------
# Prints the short usage reminder (usage line plus a pointer to --help)
# on standard output.
sub showminimal()
{
        print "Usage: huge-combine3.pl [OPTIONS] COUNT1 COUNT2",
              "\nTYPE huge-combine3.pl --help for help\n";
}
430
431#-----------------------------------------------------------------------------
432
# Prints the full help text (usage, arguments and options) on standard
# output.  The text is a literal block without any interpolation.
sub showhelp()
{
        print <<'END_OF_HELP';
Usage:  huge-combine3.pl [OPTIONS] COUNT1 COUNT2

Combines two trigram files COUNT1 and COUNT2.

COUNT1 COUNT2
	Trigram files created by count.pl.

OPTIONS:
--help
        Displays this message.
--version
        Displays the version information.
Type 'perldoc huge-combine3.pl' to view detailed documentation of
huge-combine3.
END_OF_HELP
}
451
452#------------------------------------------------------------------------------
# Prints the version banner (program version, one-line description,
# copyright and date of last update) on standard output.
sub showversion()
{
        print "huge-combine3.pl      -       Version 0.01\n",
              "Combines the given two trigram files.\n",
              "Copyright (C) 2004, Amruta Purandare & Ted Pedersen.\n",
              "Date of Last Update:     03/03/2004\n";
}
461
462#############################################################################
463
464