#!/usr/local/bin/perl -w

=head1 NAME

huge-sort.pl - Sort a --tokenlist of bigrams from huge-count.pl in alphabetical order.

=head1 SYNOPSIS

count.pl --tokenlist input.out input

huge-sort.pl --keep input.out

=head1 DESCRIPTION

huge-sort.pl takes as input a file of (possibly duplicated) bigrams generated
by count.pl with the --tokenlist option, counts the frequency of each
bigram and sorts them in alphabetical order.

Each output line holds a bigram followed by its frequency, the number of
input lines whose first token is the bigram's first token, and the number
of input lines whose second token is the bigram's second token.

The output file will be found in input-file-sorted.

This program is used internally by huge-count.pl.

=head1 USAGE

huge-sort.pl [OPTIONS] SOURCE

=head1 INPUT

=head2 Required Arguments:

=head3 SOURCE

Input to huge-sort.pl should be a single flat file generated by
count.pl with the --tokenlist option. The result file is named after the
input source file with a '-sorted' extension, SOURCE-sorted.

=head2 Optional Arguments:

=head4 --keep

Keeps the unsorted input file. By default the input file is deleted
once the sorted file has been written.

=head3 Other Options:

=head4 --help

Displays the help information.

=head4 --version

Displays the version information.

=head1 AUTHOR

Ying Liu, University of Minnesota, Twin Cities.
liux0395 at umn.edu

Ted Pedersen, University of Minnesota, Duluth.
tpederse at umn.edu

=head1 COPYRIGHT

Copyright (C) 2009-2011, Ying Liu and Ted Pedersen

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

=cut


###############################################################################
#-----------------------------------------------------------------------------
#                              Start of program
#-----------------------------------------------------------------------------
###############################################################################

use strict;
use warnings;

use Getopt::Long;

# first check if no commandline options have been provided... in which case
# print out the usage notes!
if ( !@ARGV )
{
    minimalUsageNotes();
    exit;
}

# now get the options!
# A mistyped option must abort the run: silently ignoring e.g. a misspelt
# --keep would otherwise lead to the input file being deleted.
my ( $opt_keep, $opt_help, $opt_version );
GetOptions(
    "keep"    => \$opt_keep,
    "version" => \$opt_version,
    "help"    => \$opt_help,
) or do { askHelp(); exit 1; };

# if help has been requested, print out help!
if ($opt_help)
{
    showHelp();
    exit;
}

# if version has been requested, show version!
if ($opt_version)
{
    showVersion();
    exit;
}

# the first remaining argument is the token list to sort
my $file = $ARGV[0];
if ( !defined $file )
{
    minimalUsageNotes();
    exit 1;
}

open( my $in, '<', $file )
    or die("Error: cannot open file '$file': $!\n");

# get the frequency of each unique bigram, together with how often each
# word occurs as the first (%w1) and as the second (%w2) token of a line
my %bigrams = ();
my %w1      = ();
my %w2      = ();
while ( my $line = <$in> )
{
    # chomp (not chop) so that a final line without a trailing newline
    # does not lose its last character
    chomp $line;
    next if $line eq '';

    my @words = split /<>/, $line;
    if ( @words < 2 )
    {
        warn "Warning: skipping malformed line $. in '$file'\n";
        next;
    }

    $bigrams{$line}++;
    $w1{ $words[0] }++;
    $w2{ $words[1] }++;
}
close $in;


# sort the bigrams in the alphabet order
my $sorted = $file . "-sorted";
open( my $out, '>', $sorted )
    or die("Error: cannot open file '$sorted': $!\n");

foreach my $bigram ( sort keys %bigrams )
{
    my @words = split /<>/, $bigram;
    print {$out} "$bigram$bigrams{$bigram} $w1{$words[0]} $w2{$words[1]}\n";
}

# buffered write errors only surface at close; never delete the input
# unless the sorted file is known to be complete
close $out or die("Error: cannot write file '$sorted': $!\n");

# remove the unsorted duplicated bigrams
if ( !$opt_keep )
{
    # unlink instead of a shell "rm" so that file names containing spaces
    # or shell metacharacters are handled safely
    unlink $file or warn "Warning: cannot remove file '$file': $!\n";
}

#-----------------------------------------------------------------------------
#                       User Defined Function Definitions
#-----------------------------------------------------------------------------

# function to output a minimal usage note when the user has not provided any
# commandline options
sub minimalUsageNotes
{
    print STDERR "Usage: huge-sort.pl [OPTIONS] SOURCE\n";
    askHelp();
}

# function to output "ask for help" message when the user's goofed up!
sub askHelp
{
    print STDERR "Type huge-sort.pl --help for help.\n";
}

# function to output help messages for this program
sub showHelp
{
    print "\n";
    print "Usage: huge-sort.pl [OPTIONS] SOURCE\n\n";
    print "huge-sort.pl takes a file created by huge-count.pl --tokenlist\n";
    print "(or count.pl --tokenlist) as input, and determines the frequency\n";
    print "of each unique bigram. These bigrams are displayed in alphabetical order.\n";

    print "OPTIONS:\n\n";

    print " --keep keep the unsorted file\n";
    print " The default is to delete the unsorted file. \n\n";

    print " --help Prints this help message.\n\n";
    print " --version Prints this version message.\n\n";
}

# function to output the version number
sub showVersion
{
    print STDERR 'huge-sort.pl $Id: huge-sort.pl,v 1.10 2011/03/31 23:04:04 tpederse Exp $';
    print STDERR "\nCopyright (C) 2009-2011, Ying Liu\n";

}