1 // compile.h
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: riley@google.com (Michael Riley)
17 //
18 // \file
// Class to compile a binary Fst from textual input.
20 
21 #ifndef FST_SCRIPT_COMPILE_IMPL_H_
22 #define FST_SCRIPT_COMPILE_IMPL_H_
23 
24 #include <unordered_map>
25 using std::unordered_map;
26 using std::unordered_multimap;
27 #include <sstream>
28 #include <string>
29 #include <vector>
30 using std::vector;
31 
32 #include <iostream>
33 #include <fstream>
34 #include <sstream>
35 #include <fst/fst.h>
36 #include <fst/util.h>
37 #include <fst/vector-fst.h>
38 
39 DECLARE_string(fst_field_separator);
40 
41 namespace fst {
42 
43 // Compile a binary Fst from textual input, helper class for fstcompile.cc
44 // WARNING: Stand-alone use of this class not recommended, most code should
45 // read/write using the binary format which is much more efficient.
46 template <class A> class FstCompiler {
47  public:
48   typedef A Arc;
49   typedef typename A::StateId StateId;
50   typedef typename A::Label Label;
51   typedef typename A::Weight Weight;
52 
53   // WARNING: use of 'allow_negative_labels = true' not recommended; may
54   // cause conflicts
55   // If add_symbols_ is true, then the symbols will be dynamically added
56   // to the symbol tables.  This is only useful if you set the (i/o)keep flag
57   // to attach the final symbol table, or use the accessors.  (The input
58   // symbol tables are const and therefore not changed.)
59   FstCompiler(istream &istrm, const string &source,  // NOLINT
60               const SymbolTable *isyms, const SymbolTable *osyms,
61               const SymbolTable *ssyms, bool accep, bool ikeep,
62               bool okeep, bool nkeep, bool allow_negative_labels = false) {
63     SymbolTable* misyms = isyms ? isyms->Copy() : NULL;
64     SymbolTable* mosyms = osyms ? osyms->Copy() : NULL;
65     SymbolTable* mssyms = ssyms ? ssyms->Copy() : NULL;
66     Init(istrm, source, misyms, mosyms, mssyms, accep, ikeep, okeep, nkeep,
67          allow_negative_labels, false);
68     delete mssyms;
69     delete mosyms;
70     delete misyms;
71   }
72 
FstCompiler(istream & istrm,const string & source,SymbolTable * isyms,SymbolTable * osyms,SymbolTable * ssyms,bool accep,bool ikeep,bool okeep,bool nkeep,bool allow_negative_labels,bool add_symbols)73   FstCompiler(istream &istrm, const string &source,  // NOLINT
74               SymbolTable *isyms, SymbolTable *osyms,
75               SymbolTable *ssyms, bool accep, bool ikeep,
76               bool okeep, bool nkeep, bool allow_negative_labels,
77               bool add_symbols) {
78     Init(istrm, source, isyms, osyms, ssyms, accep, ikeep, okeep, nkeep,
79          allow_negative_labels, add_symbols);
80   }
81 
Init(istream & istrm,const string & source,SymbolTable * isyms,SymbolTable * osyms,SymbolTable * ssyms,bool accep,bool ikeep,bool okeep,bool nkeep,bool allow_negative_labels,bool add_symbols)82   void Init(istream &istrm, const string &source, SymbolTable *isyms,  // NOLINT
83             SymbolTable *osyms, SymbolTable *ssyms, bool accep, bool ikeep,
84             bool okeep, bool nkeep, bool allow_negative_labels,
85             bool add_symbols) {
86     nline_ = 0;
87     source_ = source;
88     isyms_ = isyms;
89     osyms_ = osyms;
90     ssyms_ = ssyms;
91     nstates_ = 0;
92     keep_state_numbering_ = nkeep;
93     allow_negative_labels_ = allow_negative_labels;
94     add_symbols_ = add_symbols;
95     char line[kLineLen];
96     while (istrm.getline(line, kLineLen)) {
97       ++nline_;
98       vector<char *> col;
99       string separator = FLAGS_fst_field_separator + "\n";
100       SplitToVector(line, separator.c_str(), &col, true);
101       if (col.size() == 0 || col[0][0] == '\0')  // empty line
102         continue;
103       if (col.size() > 5 ||
104           (col.size() > 4 && accep) ||
105           (col.size() == 3 && !accep)) {
106         FSTERROR() << "FstCompiler: Bad number of columns, source = "
107                    << source_
108                    << ", line = " << nline_;
109         fst_.SetProperties(kError, kError);
110         return;
111       }
112       StateId s = StrToStateId(col[0]);
113       while (s >= fst_.NumStates())
114         fst_.AddState();
115       if (nline_ == 1)
116         fst_.SetStart(s);
117 
118       Arc arc;
119       StateId d = s;
120       switch (col.size()) {
121       case 1:
122         fst_.SetFinal(s, Weight::One());
123         break;
124       case 2:
125         fst_.SetFinal(s, StrToWeight(col[1], true));
126         break;
127       case 3:
128         arc.nextstate = d = StrToStateId(col[1]);
129         arc.ilabel = StrToILabel(col[2]);
130         arc.olabel = arc.ilabel;
131         arc.weight = Weight::One();
132         fst_.AddArc(s, arc);
133         break;
134       case 4:
135         arc.nextstate = d = StrToStateId(col[1]);
136         arc.ilabel = StrToILabel(col[2]);
137         if (accep) {
138           arc.olabel = arc.ilabel;
139           arc.weight = StrToWeight(col[3], true);
140         } else {
141           arc.olabel = StrToOLabel(col[3]);
142           arc.weight = Weight::One();
143         }
144         fst_.AddArc(s, arc);
145         break;
146       case 5:
147         arc.nextstate = d = StrToStateId(col[1]);
148         arc.ilabel = StrToILabel(col[2]);
149         arc.olabel = StrToOLabel(col[3]);
150         arc.weight = StrToWeight(col[4], true);
151         fst_.AddArc(s, arc);
152       }
153       while (d >= fst_.NumStates())
154         fst_.AddState();
155     }
156     if (ikeep)
157       fst_.SetInputSymbols(isyms);
158     if (okeep)
159       fst_.SetOutputSymbols(osyms);
160   }
161 
Fst()162   const VectorFst<A> &Fst() const {
163     return fst_;
164   }
165 
166  private:
167   // Maximum line length in text file.
168   static const int kLineLen = 8096;
169 
170   int64 StrToId(const char *s, SymbolTable *syms,
171                 const char *name, bool allow_negative = false) const {
172     int64 n = 0;
173 
174     if (syms) {
175       n = (add_symbols_) ? syms->AddSymbol(s) : syms->Find(s);
176       if (n == -1 || (!allow_negative && n < 0)) {
177         FSTERROR() << "FstCompiler: Symbol \"" << s
178                    << "\" is not mapped to any integer " << name
179                    << ", symbol table = " << syms->Name()
180                    << ", source = " << source_ << ", line = " << nline_;
181         fst_.SetProperties(kError, kError);
182       }
183     } else {
184       char *p;
185       n = strtoll(s, &p, 10);
186       if (p < s + strlen(s) || (!allow_negative && n < 0)) {
187         FSTERROR() << "FstCompiler: Bad " << name << " integer = \"" << s
188                    << "\", source = " << source_ << ", line = " << nline_;
189         fst_.SetProperties(kError, kError);
190       }
191     }
192     return n;
193   }
194 
StrToStateId(const char * s)195   StateId StrToStateId(const char *s) {
196     StateId n = StrToId(s, ssyms_, "state ID");
197 
198     if (keep_state_numbering_)
199       return n;
200 
201     // remap state IDs to make dense set
202     typename unordered_map<StateId, StateId>::const_iterator it = states_.find(n);
203     if (it == states_.end()) {
204       states_[n] = nstates_;
205       return nstates_++;
206     } else {
207       return it->second;
208     }
209   }
210 
StrToILabel(const char * s)211   StateId StrToILabel(const char *s) const {
212     return StrToId(s, isyms_, "arc ilabel", allow_negative_labels_);
213   }
214 
StrToOLabel(const char * s)215   StateId StrToOLabel(const char *s) const {
216     return StrToId(s, osyms_, "arc olabel", allow_negative_labels_);
217   }
218 
StrToWeight(const char * s,bool allow_zero)219   Weight StrToWeight(const char *s, bool allow_zero) const {
220     Weight w;
221     istringstream strm(s);
222     strm >> w;
223     if (!strm || (!allow_zero && w == Weight::Zero())) {
224       FSTERROR() << "FstCompiler: Bad weight = \"" << s
225                  << "\", source = " << source_ << ", line = " << nline_;
226       fst_.SetProperties(kError, kError);
227       w = Weight::NoWeight();
228     }
229     return w;
230   }
231 
232   mutable VectorFst<A> fst_;
233   size_t nline_;
234   string source_;                      // text FST source name
235   SymbolTable *isyms_;           // ilabel symbol table (not owned)
236   SymbolTable *osyms_;           // olabel symbol table (not owned)
237   SymbolTable *ssyms_;           // slabel symbol table (not owned)
238   unordered_map<StateId, StateId> states_;  // state ID map
239   StateId nstates_;                    // number of seen states
240   bool keep_state_numbering_;
241   bool allow_negative_labels_;         // not recommended; may cause conflicts
242   bool add_symbols_;         // add to symbol tables on-the fly
243 
244   DISALLOW_COPY_AND_ASSIGN(FstCompiler);
245 };
246 
247 }  // namespace fst
248 
249 #endif  // FST_SCRIPT_COMPILE_IMPL_H_
250