1#!/usr/local/bin/perl
2
3#*****************************************************************************
4# IrstLM: IRST Language Model Toolkit
5# Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy
6
7# This library is free software; you can redistribute it and/or
8# modify it under the terms of the GNU Lesser General Public
9# License as published by the Free Software Foundation; either
10# version 2.1 of the License, or (at your option) any later version.
11
12# This library is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15# Lesser General Public License for more details.
16
17# You should have received a copy of the GNU Lesser General Public
18# License along with this library; if not, write to the Free Software
19# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
20
21#******************************************************************************
#Sorts n-grams of an ARPA file according to lexicographic order.
#The inverted sorting option is a prerequisite for building a binary
#lmtable with compile-lm in which n-grams are stored in reverse order.
25
26use strict;
27use Getopt::Long "GetOptions";
28use File::Basename;
29
# Enable warnings from here to the end of the file.
use warnings;

# Command-line state:
#   $help  - set when usage must be printed (requested or bad command line)
#   $ilm   - input ARPA LM file name
#   $olm   - output ARPA LM file name
#   $inv   - set when the inverted n-gram sort is requested
my ($help,$ilm,$olm,$inv)=();

# Input and output default to the standard streams, so the script can be
# used as a filter in a pipeline.
$ilm="/dev/stdin";
$olm="/dev/stdout";

# GetOptions returns false when the command line is malformed (unknown
# option, missing value).  In that case print the usage message instead of
# silently running with a partially parsed command line.
$help=1 unless GetOptions('ilm=s' => \$ilm,
                          'olm=s' => \$olm,
                          'inv' => \$inv,
                          'h|help' => \$help);

# An empty file name (e.g. -ilm "") is treated like a request for help.
if ($help || !$ilm || !$olm) {
	my $cmnd = basename($0);
  print "\n$cmnd - sorts n-grams according to lexicographic order\n",
	"\nUSAGE:\n",
	"       $cmnd [options]\n",
	"\nDESCRIPTION:\n",
	"       $cmnd sorts n-grams of an ARPA file according to lexicographic order.\n",
	"       Inverted sorting option is propedeutic to building a binary\n",
	"       lmtable with compile-lm with n-grams stored in reverted order.\n",
	"\nOPTIONS:\n",
    "       -ilm  <fname>         input ARPA LM filename (default /dev/stdin) \n",
    "       -olm <fname>          output ARPA LM filename (default /dev/stdout)\n",
    "       -inv                  inverted n-gram sort for compile-lm \n",
    "       -h, --help            (optional) print these instructions\n",
    "\n";

  exit(1);
}
59
60
# Main pass.  The ARPA file is copied from $ilm to $olm line by line.  The
# body of every "\N-grams:" section is piped through the external sort(1)
# utility, which appends its result to $olm.  Everything else (file header,
# section titles, the per-section centers of a qARPA file, blank lines and
# the final "\end\" marker) is copied unchanged.
#
# Dies when the input is empty, is an iARPA file, ends before the "\end\"
# marker, or when sort(1) cannot be started or exits with an error.

# Run sort(1) in the C locale so that the n-gram order does not depend on
# the user's locale settings.
$ENV{'LC_ALL'}='C';

# Three-argument open: the file names are taken literally and are never
# interpreted as open modes or pipes.
open(my $inp, '<', $ilm) or die "cannot open input LM file: $ilm\n";
open(my $out, '>', $olm) or die "cannot open output LM file: $olm\n";

warn "reading from standard input\n" if $ilm eq "/dev/stdin";
warn "writing to standard output\n" if $olm eq "/dev/stdout";

my $line=<$inp>;
die "Error: input LM file is empty: $ilm\n" unless defined $line;

#sanity check
die "Error: input cannot be an intermediate iARPA file. First convert it to ARPA format with compile-lm.\n"
	if $line=~/^iARPA/;

# In a qARPA file every n-gram section starts with a count line followed by
# that many "center" lines, which must be copied verbatim and not sorted.
my $isQuantized = ($line=~/^qARPA/) ? 1 : 0;

# The explicit defined() test ends the loop at end of file, so a truncated
# input cannot make the script spin forever.
while (defined $line && $line !~ /^\\end\\/) {

	if (my ($order) = $line =~ /^\\(\d+)-grams:/) {
		# Section title.
		print {$out} $line;
		$line = <$inp>;

		if ($isQuantized && defined $line) {
			# Count line: copy it, then copy that many center lines.
			print {$out} $line;
			my $centers = $line;
			chomp $centers;    # chomp, not chop: never eat a digit
			$line = <$inp>;
			warn "skip $centers centers\n";

			# Same value Perl's numeric conversion would give (0 when the
			# line does not start with digits), without a numeric warning.
			my $ncenters = ($centers =~ /^\s*(\d+)/) ? $1 : 0;
			for (my $c = 1; $c <= $ncenters && defined $line; $c++) {
				print {$out} $line;
				$line = <$inp>;
			}
		}

		#sort command
		# The n-gram words start at field 2 (field 1 holds the probability
		# in the ARPA format), so word i is sort field i+1.  Direct sorting
		# compares the words left to right, inverted sorting right to left.
		#"sort -b" does not seem to work properly
		my @fields = map { $_ + 1 } 1 .. $order;
		if ($inv) {
			warn "inverted sorting of $order-grams\n";
			@fields = reverse @fields;
		}
		else {
			warn "direct sorting of $order-grams\n";
		}
		my @sort_args = map { ('-k', "$_,$_") } @fields;

		# Hand the output file over to sort(1): flush and close our handle,
		# let sort append to the file, then reopen it in append mode.
		close($out) or die "cannot write to output LM file: $olm: $!\n";

		# List-form pipe open: the output file name reaches the shell as a
		# positional parameter ($1) and is never spliced into the command
		# text, so names with spaces or shell metacharacters are safe.
		open(my $sort, '|-', 'sh', '-c',
			'out=$1; shift; exec sort "$@" >> "$out"',
			'sh', $olm, @sort_args)
			or die "cannot run sort: $!\n";

		# Feed the section body to sort.  It ends at the next line starting
		# with a backslash, at an empty line, or at end of file.  A while
		# loop (rather than do/until) leaves an empty section's terminator
		# to the outer loop, so a following section title is still
		# recognised.
		while (defined $line && $line !~ /^\\/ && $line !~ /^\n/) {
			print {$sort} $line;
			$line = <$inp>;
		}

		# close() on a pipe waits for the child and reports its exit status.
		close($sort)
			or die "sort failed on $order-grams"
				. ($! ? ": $!" : " (exit status $?)") . "\n";

		open($out, '>>', $olm) or die "cannot reopen output LM file: $olm\n";
	}
	else {
		# Header line, blank line or anything else: copy unchanged.
		print {$out} $line;
		$line = <$inp>;
	}
}

die "Error: unexpected end of input, missing \\end\\ marker: $ilm\n"
	unless defined $line;

# Copy the "\end\" marker itself.
print {$out} $line;

close($inp);
close($out) or die "cannot write to output LM file: $olm: $!\n";
132