1 // util.h
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: riley@google.com (Michael Riley)
17 //
18 // \file
19 // FST utility inline definitions.
20 
21 #ifndef FST_LIB_UTIL_H__
22 #define FST_LIB_UTIL_H__
23 
24 #include <unordered_map>
25 using std::unordered_map;
26 using std::unordered_multimap;
27 #include <unordered_set>
28 using std::unordered_set;
29 using std::unordered_multiset;
30 #include <list>
31 #include <map>
32 #include <set>
33 #include <sstream>
34 #include <string>
35 #include <utility>
36 using std::pair; using std::make_pair;
37 #include <vector>
38 using std::vector;
39 
40 
41 #include <fst/compat.h>
42 #include <fst/types.h>
43 
44 #include <iostream>
45 #include <fstream>
46 #include <sstream>
47 
48 
49 //
50 // UTILITY FOR ERROR HANDLING
51 //
52 
53 DECLARE_bool(fst_error_fatal);
54 
55 #define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
56 
57 namespace fst {
58 
59 //
60 // UTILITIES FOR TYPE I/O
61 //
62 
63 // Read some types from an input stream.
64 
65 // Generic case.
66 template <typename T>
ReadType(istream & strm,T * t)67 inline istream &ReadType(istream &strm, T *t) {
68   return t->Read(strm);
69 }
70 
71 // Fixed size, contiguous memory read.
72 #define READ_POD_TYPE(T)                                    \
73 inline istream &ReadType(istream &strm, T *t) {             \
74   return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
75 }
76 
77 READ_POD_TYPE(bool);
78 READ_POD_TYPE(char);
79 READ_POD_TYPE(signed char);
80 READ_POD_TYPE(unsigned char);
81 READ_POD_TYPE(short);
82 READ_POD_TYPE(unsigned short);
83 READ_POD_TYPE(int);
84 READ_POD_TYPE(unsigned int);
85 READ_POD_TYPE(long);
86 READ_POD_TYPE(unsigned long);
87 READ_POD_TYPE(long long);
88 READ_POD_TYPE(unsigned long long);
89 READ_POD_TYPE(float);
90 READ_POD_TYPE(double);
91 
92 // String case.
ReadType(istream & strm,string * s)93 inline istream &ReadType(istream &strm, string *s) {  // NOLINT
94   s->clear();
95   int32 ns = 0;
96   strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
97   for (int i = 0; i < ns; ++i) {
98     char c;
99     strm.read(&c, 1);
100     *s += c;
101   }
102   return strm;
103 }
104 
105 // Declare some types that write to an output stream
106 
107 #define DECL_READ_TYPE2(C)                                                  \
108 template <typename S, typename T>                                           \
109 inline istream &ReadType(istream &strm, C<S, T> *c)  // NOLINT
110 
111 #define DECL_READ_TYPE3(C)                                                  \
112 template <typename S, typename T, typename U>                               \
113 inline istream &ReadType(istream &strm, C<S, T, U> *c)
114 
115 DECL_READ_TYPE2(vector);
116 DECL_READ_TYPE2(list);
117 
118 DECL_READ_TYPE3(set);
119 DECL_READ_TYPE3(unordered_set);
120 DECL_READ_TYPE3(map);
121 DECL_READ_TYPE3(unordered_map);
122 
123 // Pair case.
124 template <typename S, typename T>
ReadType(istream & strm,pair<S,T> * p)125 inline istream &ReadType(istream &strm, pair<S, T> *p) {
126   ReadType(strm, &p->first);
127   ReadType(strm, &p->second);
128   return strm;
129 }
130 
131 template <typename S, typename T>
ReadType(istream & strm,pair<const S,T> * p)132 inline istream &ReadType(istream &strm, pair<const S, T> *p) {
133   ReadType(strm, const_cast<S *>(&p->first));
134   ReadType(strm, &p->second);
135   return strm;
136 }
137 
138 // General case - no-op.
139 template <typename C>
StlReserve(C * c,int64 n)140 void StlReserve(C *c, int64 n) {}
141 
142 // Specialization for vectors.
143 template <typename S, typename T>
StlReserve(vector<S,T> * c,int64 n)144 void StlReserve(vector<S, T> *c, int64 n) {
145   c->reserve(n);
146 }
147 
148 // STL sequence container.
149 #define READ_STL_SEQ_TYPE(C)                             \
150 template <typename S, typename T>                        \
151 inline istream &ReadType(istream &strm, C<S, T> *c) {    \
152   c->clear();                                            \
153   int64 n = 0;                                           \
154   strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
155   StlReserve(c, n);                                      \
156   for (ssize_t i = 0; i < n; ++i) {                      \
157     typename C<S, T>::value_type value;                  \
158     ReadType(strm, &value);                              \
159     c->insert(c->end(), value);                          \
160   }                                                      \
161   return strm;                                           \
162 }
163 
164 READ_STL_SEQ_TYPE(vector);
165 READ_STL_SEQ_TYPE(list);
166 
167 // STL associative container.
168 #define READ_STL_ASSOC_TYPE(C)                           \
169 template <typename S, typename T, typename U>            \
170 inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
171   c->clear();                                            \
172   int64 n = 0;                                           \
173   strm.read(reinterpret_cast<char *>(&n), sizeof(n));    \
174   for (ssize_t i = 0; i < n; ++i) {                      \
175     typename C<S, T, U>::value_type value;               \
176     ReadType(strm, &value);                              \
177     c->insert(value);                                    \
178   }                                                      \
179   return strm;                                           \
180 }
181 
182 READ_STL_ASSOC_TYPE(set);
183 READ_STL_ASSOC_TYPE(unordered_set);
184 READ_STL_ASSOC_TYPE(map);
185 READ_STL_ASSOC_TYPE(unordered_map);
186 
187 // Write some types to an output stream.
188 
189 // Generic case.
190 template <typename T>
WriteType(ostream & strm,const T t)191 inline ostream &WriteType(ostream &strm, const T t) {
192   t.Write(strm);
193   return strm;
194 }
195 
196 // Fixed size, contiguous memory write.
197 #define WRITE_POD_TYPE(T)                                           \
198 inline ostream &WriteType(ostream &strm, const T t) {               \
199   return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
200 }
201 
202 WRITE_POD_TYPE(bool);
203 WRITE_POD_TYPE(char);
204 WRITE_POD_TYPE(signed char);
205 WRITE_POD_TYPE(unsigned char);
206 WRITE_POD_TYPE(short);
207 WRITE_POD_TYPE(unsigned short);
208 WRITE_POD_TYPE(int);
209 WRITE_POD_TYPE(unsigned int);
210 WRITE_POD_TYPE(long);
211 WRITE_POD_TYPE(unsigned long);
212 WRITE_POD_TYPE(long long);
213 WRITE_POD_TYPE(unsigned long long);
214 WRITE_POD_TYPE(float);
215 WRITE_POD_TYPE(double);
216 
217 // String case.
WriteType(ostream & strm,const string & s)218 inline ostream &WriteType(ostream &strm, const string &s) {
219   int32 ns = s.size();
220   strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
221   return strm.write(s.data(), ns);
222 }
223 
224 // Declare some types that write to an output stream
225 
226 #define DECL_WRITE_TYPE2(C)                                                  \
227 template <typename S, typename T>                                            \
228 inline ostream &WriteType(ostream &strm, const C<S, T> &c)
229 
230 #define DECL_WRITE_TYPE3(C)                                                  \
231 template <typename S, typename T, typename U>                                \
232 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c)  // NOLINT
233 
234 DECL_WRITE_TYPE2(vector);
235 DECL_WRITE_TYPE2(list);
236 
237 DECL_WRITE_TYPE3(set);
238 DECL_WRITE_TYPE3(unordered_set);
239 DECL_WRITE_TYPE3(map);
240 DECL_WRITE_TYPE3(unordered_map);
241 
242 // Pair case.
243 template <typename S, typename T>
WriteType(ostream & strm,const pair<S,T> & p)244 inline ostream &WriteType(ostream &strm, const pair<S, T> &p) {  // NOLINT
245   WriteType(strm, p.first);
246   WriteType(strm, p.second);
247   return strm;
248 }
249 
250 // STL sequence container.
251 #define WRITE_STL_SEQ_TYPE(C)                                                \
252 template <typename S, typename T>                                            \
253 inline ostream &WriteType(ostream &strm, const C<S, T> &c) {                 \
254   int64 n = c.size();                                                        \
255   strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
256   for (typename C<S, T>::const_iterator it = c.begin();                      \
257        it != c.end(); ++it)                                                  \
258      WriteType(strm, *it);                                                   \
259   return strm;                                                               \
260 }
261 
262 WRITE_STL_SEQ_TYPE(vector);
263 WRITE_STL_SEQ_TYPE(list);
264 
265 // STL associative container.
266 #define WRITE_STL_ASSOC_TYPE(C)                                              \
267 template <typename S, typename T, typename U>                                \
268 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) {              \
269   int64 n = c.size();                                                        \
270   strm.write(reinterpret_cast<char *>(&n), sizeof(n));                       \
271   for (typename C<S, T, U>::const_iterator it = c.begin();                   \
272        it != c.end(); ++it)                                                  \
273      WriteType(strm, *it);                                                   \
274   return strm;                                                               \
275 }
276 
277 WRITE_STL_ASSOC_TYPE(set);
278 WRITE_STL_ASSOC_TYPE(unordered_set);
279 WRITE_STL_ASSOC_TYPE(map);
280 WRITE_STL_ASSOC_TYPE(unordered_map);
281 
282 // Utilities for converting between int64 or Weight and string.
283 
284 int64 StrToInt64(const string &s, const string &src, size_t nline,
285                  bool allow_negative, bool *error = 0);
286 
287 template <typename Weight>
StrToWeight(const string & s,const string & src,size_t nline)288 Weight StrToWeight(const string &s, const string &src, size_t nline) {
289   Weight w;
290   istringstream strm(s);
291   strm >> w;
292   if (!strm) {
293     FSTERROR() << "StrToWeight: Bad weight = \"" << s
294                << "\", source = " << src << ", line = " << nline;
295     return Weight::NoWeight();
296   }
297   return w;
298 }
299 
300 void Int64ToStr(int64 n, string *s);
301 
302 template <typename Weight>
WeightToStr(Weight w,string * s)303 void WeightToStr(Weight w, string *s) {
304   ostringstream strm;
305   strm.precision(9);
306   strm << w;
307   s->append(strm.str().data(), strm.str().size());
308 }
309 
310 // Utilities for reading/writing integer pairs (typically labels)
311 
312 // Modifies line, vec consists of pointers to of buffer beginning
313 // with line.
314 void SplitToVector(char *line, const char *delim,
315                    vector<char *> *vec, bool omit_empty_strings);
316 
317 // Returns true on success
318 template <typename I>
319 bool ReadIntPairs(const string& filename,
320                     vector<pair<I, I> >* pairs,
321                     bool allow_negative = false) {
322   ifstream strm(filename.c_str());
323 
324   if (!strm) {
325     LOG(ERROR) << "ReadIntPairs: Can't open file: " << filename;
326     return false;
327   }
328 
329   const int kLineLen = 8096;
330   char line[kLineLen];
331   size_t nline = 0;
332 
333   pairs->clear();
334   while (strm.getline(line, kLineLen)) {
335     ++nline;
336     vector<char *> col;
337     SplitToVector(line, "\n\t ", &col, true);
338     // empty line or comment?
339     if (col.size() == 0 || col[0][0] == '\0' || col[0][0] == '#')
340       continue;
341     if (col.size() != 2) {
342       LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
343                  << "file = " << filename << ", line = " << nline;
344       return false;
345     }
346 
347     bool err;
348     I i1 = StrToInt64(col[0], filename, nline, allow_negative, &err);
349     if (err) return false;
350     I i2 = StrToInt64(col[1], filename, nline, allow_negative, &err);
351     if (err) return false;
352     pairs->push_back(make_pair(i1, i2));
353   }
354   return true;
355 }
356 
357 // Returns true on success
358 template <typename I>
WriteIntPairs(const string & filename,const vector<pair<I,I>> & pairs)359 bool WriteIntPairs(const string& filename,
360                    const vector<pair<I, I> >& pairs) {
361   ostream *strm = &cout;
362   if (!filename.empty()) {
363     strm = new ofstream(filename.c_str());
364     if (!*strm) {
365       LOG(ERROR) << "WriteIntPairs: Can't open file: " << filename;
366       return false;
367     }
368   }
369 
370   for (ssize_t n = 0; n < pairs.size(); ++n)
371     *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
372 
373   if (!*strm) {
374     LOG(ERROR) << "WriteIntPairs: Write failed: "
375                << (filename.empty() ? "standard output" : filename);
376     return false;
377   }
378   if (strm != &cout)
379     delete strm;
380   return true;
381 }
382 
383 // Utilities for reading/writing label pairs
384 
385 template <typename Label>
386 bool ReadLabelPairs(const string& filename,
387                     vector<pair<Label, Label> >* pairs,
388                     bool allow_negative = false) {
389   return ReadIntPairs(filename, pairs, allow_negative);
390 }
391 
392 template <typename Label>
WriteLabelPairs(const string & filename,vector<pair<Label,Label>> & pairs)393 bool WriteLabelPairs(const string& filename,
394                      vector<pair<Label, Label> >& pairs) {
395   return WriteIntPairs(filename, pairs);
396 }
397 
398 // Utilities for converting a type name to a legal C symbol.
399 
400 void ConvertToLegalCSymbol(string *s);
401 
402 
403 //
404 // UTILITIES FOR STREAM I/O
405 //
406 
407 bool AlignInput(istream &strm);
408 bool AlignOutput(ostream &strm);
409 
410 //
411 // UTILITIES FOR PROTOCOL BUFFER I/O
412 //
413 
414 
415 // An associative container for which testing membership is
416 // faster than an STL set if members are restricted to an interval
417 // that excludes most non-members. A 'Key' must have ==, !=, and < defined.
418 // Element 'NoKey' should be a key that marks an uninitialized key and
419 // is otherwise unused. 'Find()' returns an STL const_iterator to the match
420 // found, otherwise it equals 'End()'.
421 template <class Key, Key NoKey>
422 class CompactSet {
423 public:
424   typedef typename set<Key>::const_iterator const_iterator;
425 
CompactSet()426   CompactSet()
427     : min_key_(NoKey),
428       max_key_(NoKey) { }
429 
CompactSet(const CompactSet<Key,NoKey> & compact_set)430   CompactSet(const CompactSet<Key, NoKey> &compact_set)
431     : set_(compact_set.set_),
432       min_key_(compact_set.min_key_),
433       max_key_(compact_set.max_key_) { }
434 
Insert(Key key)435   void Insert(Key key) {
436     set_.insert(key);
437     if (min_key_ == NoKey || key < min_key_)
438       min_key_ = key;
439     if (max_key_ == NoKey || max_key_ < key)
440         max_key_ = key;
441   }
442 
Erase(Key key)443   void Erase(Key key) {
444     set_.erase(key);
445     if (set_.empty()) {
446         min_key_ = max_key_ = NoKey;
447     } else if (key == min_key_) {
448       ++min_key_;
449     } else if (key == max_key_) {
450       --max_key_;
451     }
452   }
453 
Clear()454   void Clear() {
455     set_.clear();
456     min_key_ = max_key_ = NoKey;
457   }
458 
Find(Key key)459   const_iterator Find(Key key) const {
460     if (min_key_ == NoKey ||
461         key < min_key_ || max_key_ < key)
462       return set_.end();
463     else
464       return set_.find(key);
465   }
466 
Member(Key key)467   bool Member(Key key) const {
468     if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
469       return false;   // out of range
470     } else if (min_key_ != NoKey && max_key_ + 1 == min_key_ + set_.size()) {
471       return true;    // dense range
472     } else {
473       return set_.find(key) != set_.end();
474     }
475   }
476 
Begin()477   const_iterator Begin() const { return set_.begin(); }
478 
End()479   const_iterator End() const { return set_.end(); }
480 
481   // All stored keys are greater than or equal to this value.
LowerBound()482   Key LowerBound() const { return min_key_; }
483 
484   // All stored keys are less than or equal to this value.
UpperBound()485   Key UpperBound() const { return max_key_; }
486 
487 private:
488   set<Key> set_;
489   Key min_key_;
490   Key max_key_;
491 
492   void operator=(const CompactSet<Key, NoKey> &);  //disallow
493 };
494 
495 }  // namespace fst
496 
497 #endif  // FST_LIB_UTIL_H__
498