#!/usr/local/bin/perl

#*****************************************************************************
# IrstLM: IRST Language Model Toolkit
# Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#******************************************************************************
# Sorts the n-grams of an ARPA file according to lexicographic order.
# Inverted sorting is a prerequisite for building a binary lmtable with
# compile-lm whose n-grams are stored in reversed order.
#
# The file is streamed line by line: everything outside the "\N-grams:"
# sections is copied verbatim, while the entries of each section are piped
# through the external sort(1) command, which appends its result to the
# output file.

use strict;
use warnings;
use Getopt::Long "GetOptions";
use File::Basename;

my ($help, $inv);
my $ilm = "/dev/stdin";
my $olm = "/dev/stdout";

# Show the usage when the command line cannot be parsed.
$help = 1 unless GetOptions(
    'ilm=s'  => \$ilm,
    'olm=s'  => \$olm,
    'inv'    => \$inv,
    'h|help' => \$help,
);

if ($help || !$ilm || !$olm) {
    my $cmnd = basename($0);
    print "\n$cmnd - sorts n-grams according to lexicographic order\n",
        "\nUSAGE:\n",
        " $cmnd [options]\n",
        "\nDESCRIPTION:\n",
        " $cmnd sorts n-grams of an ARPA file according to lexicographic order.\n",
        " Inverted sorting option is propedeutic to building a binary\n",
        " lmtable with compile-lm with n-grams stored in reverted order.\n",
        "\nOPTIONS:\n",
        " -ilm <fname> input ARPA LM filename (default /dev/stdin) \n",
        " -olm <fname> output ARPA LM filename (default /dev/stdout)\n",
        " -inv inverted n-gram sort for compile-lm \n",
        " -h, --help (optional) print these instructions\n",
        "\n";

    exit(1);
}

# Accept the conventional "-" as shorthand for the standard streams.
$ilm = "/dev/stdin"  if $ilm eq "-";
$olm = "/dev/stdout" if $olm eq "-";

# sort(1) must compare bytes, independently of the user's locale.
$ENV{'LC_ALL'} = 'C';

open(my $inp, '<', $ilm) or die "cannot open input LM file: $ilm\n";
open(my $out, '>', $olm) or die "cannot open output LM file: $olm\n";

warn "reading from standard input\n" if $ilm eq "/dev/stdin";
warn "writing to standard output\n"  if $olm eq "/dev/stdout";

my $line = <$inp>;
die "Error: input LM file is empty: $ilm\n" unless defined $line;

#sanity check
die "Error: input cannot be an intermediate iARPA file. \nFirst convert it to ARPA format with compile-lm.\n"
    if $line =~ /^iARPA/;

my $is_quantized = ($line =~ /^qARPA/) ? 1 : 0;

# Stop at the "\end\" marker, or at end of file if the input is truncated.
while (defined $line && $line !~ /^\\end\\/) {

    if (my ($order) = $line =~ /^\\(\d+)-grams:/) {
        print {$out} $line;
        $line = <$inp>;

        if ($is_quantized) {
            # In a qARPA file the section header is followed by the number
            # of centers and then by that many lines, all copied unsorted.
            die "Error: missing number of centers for $order-grams\n"
                unless defined $line && $line =~ /^\s*(\d+)/;
            my $centers = $1;
            print {$out} $line;
            $line = <$inp>;
            warn "skip $centers centers\n";
            for (1 .. $centers) {
                last unless defined $line;
                print {$out} $line;
                $line = <$inp>;
            }
        }

        if ($inv) {
            warn "inverted sorting of $order-grams\n";
        }
        else {
            warn "direct sorting of $order-grams\n";
        }

        # Everything written so far must reach the file before sort appends.
        close($out) or die "cannot write output LM file: $olm\n";

        my $sort = open_sort_pipe($olm, sort_key_args($order, $inv));

        # A section ends at the next "\..." line or at an empty line; testing
        # before printing keeps an empty section from feeding the following
        # header to sort.
        while (defined $line && $line !~ /^\\/ && $line !~ /^\n/) {
            print {$sort} $line;
            $line = <$inp>;
        }

        # Closing the pipe waits for sort and reports its exit status.
        close($sort)
            or die "Error: sorting of $order-grams failed (exit status "
            . ($? >> 8) . ")\n";

        open($out, '>>', $olm) or die "cannot open output LM file: $olm\n";
    }
    else {
        print {$out} $line;
        $line = <$inp>;
    }
}

# Copy the "\end\" marker itself, when the input has one.
print {$out} $line if defined $line;

close($inp);
close($out) or die "cannot write output LM file: $olm\n";

die "Error: unexpected end of input LM file (missing \\end\\ marker): $ilm\n"
    unless defined $line;

# Returns the sort(1) key options for n-grams of the given order.
# Field 1 of an entry is skipped; words occupy fields 2 .. order+1.
# Keys are listed first-to-last word for direct sorting and last-to-first
# word for inverted sorting.
sub sort_key_args {
    my ($order, $inverted) = @_;
    my @fields = (2 .. $order + 1);
    @fields = reverse @fields if $inverted;
    return map { ('-k', "$_,$_") } @fields;
}

# Starts "sort @sort_args" with its standard output appended to $outfile and
# returns a write handle connected to its standard input.  The command is
# exec'ed from a forked child rather than through a shell, so the output
# filename is never interpreted by a shell.
sub open_sort_pipe {
    my ($outfile, @sort_args) = @_;

    my $pid = open(my $pipe, '|-');
    die "cannot fork sort command: $!\n" unless defined $pid;

    if ($pid == 0) {
        # Child: standard input is already the pipe from the parent.
        open(STDOUT, '>>', $outfile)
            or die "cannot open output LM file: $outfile\n";
        exec('sort', @sort_args) or die "cannot run sort: $!\n";
    }

    return $pipe;
}