1 // compile.h 2 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Copyright 2005-2010 Google, Inc. 16 // Author: riley@google.com (Michael Riley) 17 // 18 // \file 19 // Class to to compile a binary Fst from textual input. 20 21 #ifndef FST_SCRIPT_COMPILE_IMPL_H_ 22 #define FST_SCRIPT_COMPILE_IMPL_H_ 23 24 #include <unordered_map> 25 using std::unordered_map; 26 using std::unordered_multimap; 27 #include <sstream> 28 #include <string> 29 #include <vector> 30 using std::vector; 31 32 #include <iostream> 33 #include <fstream> 34 #include <sstream> 35 #include <fst/fst.h> 36 #include <fst/util.h> 37 #include <fst/vector-fst.h> 38 39 DECLARE_string(fst_field_separator); 40 41 namespace fst { 42 43 // Compile a binary Fst from textual input, helper class for fstcompile.cc 44 // WARNING: Stand-alone use of this class not recommended, most code should 45 // read/write using the binary format which is much more efficient. 46 template <class A> class FstCompiler { 47 public: 48 typedef A Arc; 49 typedef typename A::StateId StateId; 50 typedef typename A::Label Label; 51 typedef typename A::Weight Weight; 52 53 // WARNING: use of 'allow_negative_labels = true' not recommended; may 54 // cause conflicts 55 // If add_symbols_ is true, then the symbols will be dynamically added 56 // to the symbol tables. This is only useful if you set the (i/o)keep flag 57 // to attach the final symbol table, or use the accessors. (The input 58 // symbol tables are const and therefore not changed.) 59 FstCompiler(istream &istrm, const string &source, // NOLINT 60 const SymbolTable *isyms, const SymbolTable *osyms, 61 const SymbolTable *ssyms, bool accep, bool ikeep, 62 bool okeep, bool nkeep, bool allow_negative_labels = false) { 63 SymbolTable* misyms = isyms ? isyms->Copy() : NULL; 64 SymbolTable* mosyms = osyms ? osyms->Copy() : NULL; 65 SymbolTable* mssyms = ssyms ? ssyms->Copy() : NULL; 66 Init(istrm, source, misyms, mosyms, mssyms, accep, ikeep, okeep, nkeep, 67 allow_negative_labels, false); 68 delete mssyms; 69 delete mosyms; 70 delete misyms; 71 } 72 FstCompiler(istream & istrm,const string & source,SymbolTable * isyms,SymbolTable * osyms,SymbolTable * ssyms,bool accep,bool ikeep,bool okeep,bool nkeep,bool allow_negative_labels,bool add_symbols)73 FstCompiler(istream &istrm, const string &source, // NOLINT 74 SymbolTable *isyms, SymbolTable *osyms, 75 SymbolTable *ssyms, bool accep, bool ikeep, 76 bool okeep, bool nkeep, bool allow_negative_labels, 77 bool add_symbols) { 78 Init(istrm, source, isyms, osyms, ssyms, accep, ikeep, okeep, nkeep, 79 allow_negative_labels, add_symbols); 80 } 81 Init(istream & istrm,const string & source,SymbolTable * isyms,SymbolTable * osyms,SymbolTable * ssyms,bool accep,bool ikeep,bool okeep,bool nkeep,bool allow_negative_labels,bool add_symbols)82 void Init(istream &istrm, const string &source, SymbolTable *isyms, // NOLINT 83 SymbolTable *osyms, SymbolTable *ssyms, bool accep, bool ikeep, 84 bool okeep, bool nkeep, bool allow_negative_labels, 85 bool add_symbols) { 86 nline_ = 0; 87 source_ = source; 88 isyms_ = isyms; 89 osyms_ = osyms; 90 ssyms_ = ssyms; 91 nstates_ = 0; 92 keep_state_numbering_ = nkeep; 93 allow_negative_labels_ = allow_negative_labels; 94 add_symbols_ = add_symbols; 95 char line[kLineLen]; 96 while (istrm.getline(line, kLineLen)) { 97 ++nline_; 98 vector<char *> col; 99 string separator = FLAGS_fst_field_separator + "\n"; 100 SplitToVector(line, separator.c_str(), &col, true); 101 if (col.size() == 0 || col[0][0] == '\0') // empty line 102 continue; 103 if (col.size() > 5 || 104 (col.size() > 4 && accep) || 105 (col.size() == 3 && !accep)) { 106 FSTERROR() << "FstCompiler: Bad number of columns, source = " 107 << source_ 108 << ", line = " << nline_; 109 fst_.SetProperties(kError, kError); 110 return; 111 } 112 StateId s = StrToStateId(col[0]); 113 while (s >= fst_.NumStates()) 114 fst_.AddState(); 115 if (nline_ == 1) 116 fst_.SetStart(s); 117 118 Arc arc; 119 StateId d = s; 120 switch (col.size()) { 121 case 1: 122 fst_.SetFinal(s, Weight::One()); 123 break; 124 case 2: 125 fst_.SetFinal(s, StrToWeight(col[1], true)); 126 break; 127 case 3: 128 arc.nextstate = d = StrToStateId(col[1]); 129 arc.ilabel = StrToILabel(col[2]); 130 arc.olabel = arc.ilabel; 131 arc.weight = Weight::One(); 132 fst_.AddArc(s, arc); 133 break; 134 case 4: 135 arc.nextstate = d = StrToStateId(col[1]); 136 arc.ilabel = StrToILabel(col[2]); 137 if (accep) { 138 arc.olabel = arc.ilabel; 139 arc.weight = StrToWeight(col[3], true); 140 } else { 141 arc.olabel = StrToOLabel(col[3]); 142 arc.weight = Weight::One(); 143 } 144 fst_.AddArc(s, arc); 145 break; 146 case 5: 147 arc.nextstate = d = StrToStateId(col[1]); 148 arc.ilabel = StrToILabel(col[2]); 149 arc.olabel = StrToOLabel(col[3]); 150 arc.weight = StrToWeight(col[4], true); 151 fst_.AddArc(s, arc); 152 } 153 while (d >= fst_.NumStates()) 154 fst_.AddState(); 155 } 156 if (ikeep) 157 fst_.SetInputSymbols(isyms); 158 if (okeep) 159 fst_.SetOutputSymbols(osyms); 160 } 161 Fst()162 const VectorFst<A> &Fst() const { 163 return fst_; 164 } 165 166 private: 167 // Maximum line length in text file. 168 static const int kLineLen = 8096; 169 170 int64 StrToId(const char *s, SymbolTable *syms, 171 const char *name, bool allow_negative = false) const { 172 int64 n = 0; 173 174 if (syms) { 175 n = (add_symbols_) ? syms->AddSymbol(s) : syms->Find(s); 176 if (n == -1 || (!allow_negative && n < 0)) { 177 FSTERROR() << "FstCompiler: Symbol \"" << s 178 << "\" is not mapped to any integer " << name 179 << ", symbol table = " << syms->Name() 180 << ", source = " << source_ << ", line = " << nline_; 181 fst_.SetProperties(kError, kError); 182 } 183 } else { 184 char *p; 185 n = strtoll(s, &p, 10); 186 if (p < s + strlen(s) || (!allow_negative && n < 0)) { 187 FSTERROR() << "FstCompiler: Bad " << name << " integer = \"" << s 188 << "\", source = " << source_ << ", line = " << nline_; 189 fst_.SetProperties(kError, kError); 190 } 191 } 192 return n; 193 } 194 StrToStateId(const char * s)195 StateId StrToStateId(const char *s) { 196 StateId n = StrToId(s, ssyms_, "state ID"); 197 198 if (keep_state_numbering_) 199 return n; 200 201 // remap state IDs to make dense set 202 typename unordered_map<StateId, StateId>::const_iterator it = states_.find(n); 203 if (it == states_.end()) { 204 states_[n] = nstates_; 205 return nstates_++; 206 } else { 207 return it->second; 208 } 209 } 210 StrToILabel(const char * s)211 StateId StrToILabel(const char *s) const { 212 return StrToId(s, isyms_, "arc ilabel", allow_negative_labels_); 213 } 214 StrToOLabel(const char * s)215 StateId StrToOLabel(const char *s) const { 216 return StrToId(s, osyms_, "arc olabel", allow_negative_labels_); 217 } 218 StrToWeight(const char * s,bool allow_zero)219 Weight StrToWeight(const char *s, bool allow_zero) const { 220 Weight w; 221 istringstream strm(s); 222 strm >> w; 223 if (!strm || (!allow_zero && w == Weight::Zero())) { 224 FSTERROR() << "FstCompiler: Bad weight = \"" << s 225 << "\", source = " << source_ << ", line = " << nline_; 226 fst_.SetProperties(kError, kError); 227 w = Weight::NoWeight(); 228 } 229 return w; 230 } 231 232 mutable VectorFst<A> fst_; 233 size_t nline_; 234 string source_; // text FST source name 235 SymbolTable *isyms_; // ilabel symbol table (not owned) 236 SymbolTable *osyms_; // olabel symbol table (not owned) 237 SymbolTable *ssyms_; // slabel symbol table (not owned) 238 unordered_map<StateId, StateId> states_; // state ID map 239 StateId nstates_; // number of seen states 240 bool keep_state_numbering_; 241 bool allow_negative_labels_; // not recommended; may cause conflicts 242 bool add_symbols_; // add to symbol tables on-the fly 243 244 DISALLOW_COPY_AND_ASSIGN(FstCompiler); 245 }; 246 247 } // namespace fst 248 249 #endif // FST_SCRIPT_COMPILE_IMPL_H_ 250