1#!/usr/local/bin/perl -w
2
3=head1 NAME
4
5huge-sort.pl - Sort a --tokenlist of bigrams from huge-count.pl in alphabetical order.
6
7=head1 SYNOPSIS
8
9count.pl --tokenlist input.out input
10
11huge-sort.pl --keep input.out
12
13=head1 DESCRIPTION
14
huge-sort.pl takes as input a file of bigrams, one per line and possibly
repeated, generated by count.pl with the --tokenlist option. It counts the
frequency of each bigram and writes the unique bigrams in alphabetical order.
18
The output is written to a file named after the input file with '-sorted'
appended (for example, input.out-sorted).
20
21This program is used internally by huge-count.pl.
22
=head1 USAGE
24
25huge-sort.pl [OPTIONS] SOURCE
26
27=head1 INPUT
28
29=head2 Required Arguments:
30
31=head3 SOURCE
32
Input to huge-sort.pl should be a single flat file generated by
count.pl with the --tokenlist option. The result is written to a file whose
name is the input file name with '-sorted' appended, i.e. SOURCE-sorted.
36
37=head2 Optional Arguments:
38
39=head4 --keep
40
When --keep is given, the unsorted input file is kept. By default it is
deleted after the sorted file has been written.
42
43=head3 Other Options:
44
45=head4 --help
46
47Displays the help information.
48
49=head4 --version
50
51Displays the version information.
52
53=head1 AUTHOR
54
55Ying Liu, University of Minnesota, Twin Cities.
56liux0395 at umn.edu
57
58Ted Pedersen, University of Minnesota, Duluth.
59tpederse at umn.edu
60
61=head1 COPYRIGHT
62
63Copyright (C) 2009-2011, Ying Liu and Ted Pedersen
64
65This program is free software; you can redistribute it and/or
66modify it under the terms of the GNU General Public License
67as published by the Free Software Foundation; either version 2
68of the License, or (at your option) any later version.
69This program is distributed in the hope that it will be useful,
70but WITHOUT ANY WARRANTY; without even the implied warranty of
71MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
72GNU General Public License for more details.
73
74You should have received a copy of the GNU General Public License
75along with this program; if not, write to the Free Software
76Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
77
78=cut
79
80
81###############################################################################
82#-----------------------------------------------------------------------------
83#                              Start of program
84#-----------------------------------------------------------------------------
85###################################################################################
86
87use Getopt::Long;
88
89# first check if no commandline options have been provided... in which case
90# print out the usage notes!
91if ( $#ARGV == -1 )
92{
93    &minimalUsageNotes();
94    exit;
95}
96
97# now get the options!
98GetOptions( "keep", "version", "help" );
99
100if ( defined $opt_keep )    { $opt_keep = 1 }
101else                          { $opt_keep = 0 }
102
103# if help has been requested, print out help!
104if ( defined $opt_help )
105{
106    $opt_help = 1;
107    &showHelp();
108    exit;
109}
110
111# if version has been requested, show version!
112if ( defined $opt_version )
113{
114    $opt_version = 1;
115    &showVersion();
116    exit;
117}
118
119
# Name of the file to process: the first argument left in @ARGV.
my $file = $ARGV[0];
die("Error: no input file specified\n") unless defined $file;

# Three-argument open with a lexical handle, so that unusual characters in
# the file name cannot be mistaken for an open mode.
open( my $token_fh, '<', $file )
    or die("Error: cannot open file '$file': $!\n");

# Frequency tables built from the input, one bigram per line:
#   %bigrams : whole input line (newline removed)  -> number of occurrences
#   %w1      : first  '<>'-separated field of line -> number of occurrences
#   %w2      : second '<>'-separated field of line -> number of occurrences
my %bigrams = ();
my %w1      = ();
my %w2      = ();
while ( my $line = <$token_fh> ) {

    # chomp, not chop: chop would eat a real character of the bigram when
    # the last line of the file has no trailing newline.
    chomp $line;

    # Blank lines carry no bigram.
    next if $line eq '';

    # A usable line needs at least two fields; anything else would only
    # create junk entries under the empty key.
    my @words = split( /<>/, $line );
    if ( @words < 2 ) {
        warn("Warning: skipping malformed line $. of '$file'\n");
        next;
    }

    $bigrams{$line}++;
    $w1{ $words[0] }++;
    $w2{ $words[1] }++;
}
close $token_fh;
137
138
# Write one line per unique bigram, in Perl's default (string) sort order,
# to "<input>-sorted". Each output line is:
#     <bigram><bigram count> <count of its first word> <count of its second word>
# The bigram key and its count are printed back to back, exactly as before.
my $sorted = "$file" . "-sorted";
open( my $sorted_fh, '>', $sorted )
    or die("Error: cannot open file '$sorted'\n");

foreach my $bigram ( sort keys %bigrams ) {
    my ( $first, $second ) = split( /<>/, $bigram );
    print {$sorted_fh} "$bigram$bigrams{$bigram} $w1{$first} $w2{$second}\n"
        or die("Error: cannot write to file '$sorted': $!\n");
}

# Buffered write errors (e.g. a full disk) only surface at close. Dying
# here stops the script before the unsorted input can be removed.
close($sorted_fh)
    or die("Error: cannot write to file '$sorted': $!\n");
150
# Remove the unsorted input unless --keep was given.
# unlink is used instead of shelling out to "rm" so that file names
# containing spaces or shell metacharacters are handled literally and are
# never interpreted by a shell. A failure is reported but is not fatal.
if ( $opt_keep == 0 ) {
    unlink($file)
        or warn("Warning: cannot remove file '$file': $!\n");
}
156
157#-----------------------------------------------------------------------------
158#                       User Defined Function Definitions
159#-----------------------------------------------------------------------------
160
# Print the one-line usage synopsis on STDERR, followed by the hint that
# points the user at --help. Used when the script is started without any
# command-line arguments.
sub minimalUsageNotes {
    my $synopsis = "Usage: huge-sort.pl [OPTIONS] SOURCE\n";
    print STDERR $synopsis;
    askHelp();
}
168
# Print, on STDERR, the single-line hint telling the user how to get the
# full help text.
sub askHelp {
    my $hint = 'Type huge-sort.pl --help for help.';
    print STDERR $hint, "\n";
}
174
# Print the full help text (usage line, short description and the list of
# options) on STDOUT.
sub showHelp {

    # The pieces are kept as separate strings with explicit newlines so
    # that the emitted text, including its blank lines and trailing
    # spaces, is spelled out exactly.
    my @help_text = (
        "\n",
        "Usage: huge-sort.pl [OPTIONS] SOURCE\n\n",
        "huge-sort.pl takes a file created by huge-count.pl --tokenlist\n",
        "(or count.pl --tokenlist) as input, and determines the frequency\n",
        "of each unique bigram. These bigrams are displayed in alphabetical order.\n",
        "OPTIONS:\n\n",
        "  --keep             keep the unsorted file\n",
        "                     The default is to delete the unsorted file. \n\n",
        "  --help             Prints this help message.\n\n",
        "  --version          Prints this version message.\n\n",
    );

    print join( '', @help_text );
}
192
# Print the version banner (the RCS $Id$ string) and the copyright line on
# STDERR.
sub showVersion {

    # Single quotes keep the '$Id: ... $' keyword text literal.
    my $rcs_id =
        'huge-sort.pl $Id: huge-sort.pl,v 1.10 2011/03/31 23:04:04 tpederse Exp $';
    my $copyright = "\nCopyright (C) 2009-2011, Ying Liu\n";

    print STDERR $rcs_id . $copyright;
}
200
201
202