1 // util.h
2
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: riley@google.com (Michael Riley)
17 //
18 // \file
19 // FST utility inline definitions.
20
21 #ifndef FST_LIB_UTIL_H__
22 #define FST_LIB_UTIL_H__
23
24 #include <unordered_map>
25 using std::unordered_map;
26 using std::unordered_multimap;
27 #include <unordered_set>
28 using std::unordered_set;
29 using std::unordered_multiset;
30 #include <list>
31 #include <map>
32 #include <set>
33 #include <sstream>
34 #include <string>
35 #include <utility>
36 using std::pair; using std::make_pair;
37 #include <vector>
38 using std::vector;
39
40
41 #include <fst/compat.h>
42 #include <fst/types.h>
43
44 #include <iostream>
45 #include <fstream>
46 #include <sstream>
47
48
49 //
50 // UTILITY FOR ERROR HANDLING
51 //
52
53 DECLARE_bool(fst_error_fatal);
54
55 #define FSTERROR() (FLAGS_fst_error_fatal ? LOG(FATAL) : LOG(ERROR))
56
57 namespace fst {
58
59 //
60 // UTILITIES FOR TYPE I/O
61 //
62
63 // Read some types from an input stream.
64
65 // Generic case.
66 template <typename T>
ReadType(istream & strm,T * t)67 inline istream &ReadType(istream &strm, T *t) {
68 return t->Read(strm);
69 }
70
71 // Fixed size, contiguous memory read.
72 #define READ_POD_TYPE(T) \
73 inline istream &ReadType(istream &strm, T *t) { \
74 return strm.read(reinterpret_cast<char *>(t), sizeof(T)); \
75 }
76
77 READ_POD_TYPE(bool);
78 READ_POD_TYPE(char);
79 READ_POD_TYPE(signed char);
80 READ_POD_TYPE(unsigned char);
81 READ_POD_TYPE(short);
82 READ_POD_TYPE(unsigned short);
83 READ_POD_TYPE(int);
84 READ_POD_TYPE(unsigned int);
85 READ_POD_TYPE(long);
86 READ_POD_TYPE(unsigned long);
87 READ_POD_TYPE(long long);
88 READ_POD_TYPE(unsigned long long);
89 READ_POD_TYPE(float);
90 READ_POD_TYPE(double);
91
92 // String case.
ReadType(istream & strm,string * s)93 inline istream &ReadType(istream &strm, string *s) { // NOLINT
94 s->clear();
95 int32 ns = 0;
96 strm.read(reinterpret_cast<char *>(&ns), sizeof(ns));
97 for (int i = 0; i < ns; ++i) {
98 char c;
99 strm.read(&c, 1);
100 *s += c;
101 }
102 return strm;
103 }
104
105 // Declare some types that write to an output stream
106
107 #define DECL_READ_TYPE2(C) \
108 template <typename S, typename T> \
109 inline istream &ReadType(istream &strm, C<S, T> *c) // NOLINT
110
111 #define DECL_READ_TYPE3(C) \
112 template <typename S, typename T, typename U> \
113 inline istream &ReadType(istream &strm, C<S, T, U> *c)
114
115 DECL_READ_TYPE2(vector);
116 DECL_READ_TYPE2(list);
117
118 DECL_READ_TYPE3(set);
119 DECL_READ_TYPE3(unordered_set);
120 DECL_READ_TYPE3(map);
121 DECL_READ_TYPE3(unordered_map);
122
123 // Pair case.
124 template <typename S, typename T>
ReadType(istream & strm,pair<S,T> * p)125 inline istream &ReadType(istream &strm, pair<S, T> *p) {
126 ReadType(strm, &p->first);
127 ReadType(strm, &p->second);
128 return strm;
129 }
130
131 template <typename S, typename T>
ReadType(istream & strm,pair<const S,T> * p)132 inline istream &ReadType(istream &strm, pair<const S, T> *p) {
133 ReadType(strm, const_cast<S *>(&p->first));
134 ReadType(strm, &p->second);
135 return strm;
136 }
137
138 // General case - no-op.
139 template <typename C>
StlReserve(C * c,int64 n)140 void StlReserve(C *c, int64 n) {}
141
142 // Specialization for vectors.
143 template <typename S, typename T>
StlReserve(vector<S,T> * c,int64 n)144 void StlReserve(vector<S, T> *c, int64 n) {
145 c->reserve(n);
146 }
147
148 // STL sequence container.
149 #define READ_STL_SEQ_TYPE(C) \
150 template <typename S, typename T> \
151 inline istream &ReadType(istream &strm, C<S, T> *c) { \
152 c->clear(); \
153 int64 n = 0; \
154 strm.read(reinterpret_cast<char *>(&n), sizeof(n)); \
155 StlReserve(c, n); \
156 for (ssize_t i = 0; i < n; ++i) { \
157 typename C<S, T>::value_type value; \
158 ReadType(strm, &value); \
159 c->insert(c->end(), value); \
160 } \
161 return strm; \
162 }
163
164 READ_STL_SEQ_TYPE(vector);
165 READ_STL_SEQ_TYPE(list);
166
167 // STL associative container.
168 #define READ_STL_ASSOC_TYPE(C) \
169 template <typename S, typename T, typename U> \
170 inline istream &ReadType(istream &strm, C<S, T, U> *c) { \
171 c->clear(); \
172 int64 n = 0; \
173 strm.read(reinterpret_cast<char *>(&n), sizeof(n)); \
174 for (ssize_t i = 0; i < n; ++i) { \
175 typename C<S, T, U>::value_type value; \
176 ReadType(strm, &value); \
177 c->insert(value); \
178 } \
179 return strm; \
180 }
181
182 READ_STL_ASSOC_TYPE(set);
183 READ_STL_ASSOC_TYPE(unordered_set);
184 READ_STL_ASSOC_TYPE(map);
185 READ_STL_ASSOC_TYPE(unordered_map);
186
187 // Write some types to an output stream.
188
189 // Generic case.
190 template <typename T>
WriteType(ostream & strm,const T t)191 inline ostream &WriteType(ostream &strm, const T t) {
192 t.Write(strm);
193 return strm;
194 }
195
196 // Fixed size, contiguous memory write.
197 #define WRITE_POD_TYPE(T) \
198 inline ostream &WriteType(ostream &strm, const T t) { \
199 return strm.write(reinterpret_cast<const char *>(&t), sizeof(T)); \
200 }
201
202 WRITE_POD_TYPE(bool);
203 WRITE_POD_TYPE(char);
204 WRITE_POD_TYPE(signed char);
205 WRITE_POD_TYPE(unsigned char);
206 WRITE_POD_TYPE(short);
207 WRITE_POD_TYPE(unsigned short);
208 WRITE_POD_TYPE(int);
209 WRITE_POD_TYPE(unsigned int);
210 WRITE_POD_TYPE(long);
211 WRITE_POD_TYPE(unsigned long);
212 WRITE_POD_TYPE(long long);
213 WRITE_POD_TYPE(unsigned long long);
214 WRITE_POD_TYPE(float);
215 WRITE_POD_TYPE(double);
216
217 // String case.
WriteType(ostream & strm,const string & s)218 inline ostream &WriteType(ostream &strm, const string &s) {
219 int32 ns = s.size();
220 strm.write(reinterpret_cast<const char *>(&ns), sizeof(ns));
221 return strm.write(s.data(), ns);
222 }
223
224 // Declare some types that write to an output stream
225
226 #define DECL_WRITE_TYPE2(C) \
227 template <typename S, typename T> \
228 inline ostream &WriteType(ostream &strm, const C<S, T> &c)
229
230 #define DECL_WRITE_TYPE3(C) \
231 template <typename S, typename T, typename U> \
232 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) // NOLINT
233
234 DECL_WRITE_TYPE2(vector);
235 DECL_WRITE_TYPE2(list);
236
237 DECL_WRITE_TYPE3(set);
238 DECL_WRITE_TYPE3(unordered_set);
239 DECL_WRITE_TYPE3(map);
240 DECL_WRITE_TYPE3(unordered_map);
241
242 // Pair case.
243 template <typename S, typename T>
WriteType(ostream & strm,const pair<S,T> & p)244 inline ostream &WriteType(ostream &strm, const pair<S, T> &p) { // NOLINT
245 WriteType(strm, p.first);
246 WriteType(strm, p.second);
247 return strm;
248 }
249
250 // STL sequence container.
251 #define WRITE_STL_SEQ_TYPE(C) \
252 template <typename S, typename T> \
253 inline ostream &WriteType(ostream &strm, const C<S, T> &c) { \
254 int64 n = c.size(); \
255 strm.write(reinterpret_cast<char *>(&n), sizeof(n)); \
256 for (typename C<S, T>::const_iterator it = c.begin(); \
257 it != c.end(); ++it) \
258 WriteType(strm, *it); \
259 return strm; \
260 }
261
262 WRITE_STL_SEQ_TYPE(vector);
263 WRITE_STL_SEQ_TYPE(list);
264
265 // STL associative container.
266 #define WRITE_STL_ASSOC_TYPE(C) \
267 template <typename S, typename T, typename U> \
268 inline ostream &WriteType(ostream &strm, const C<S, T, U> &c) { \
269 int64 n = c.size(); \
270 strm.write(reinterpret_cast<char *>(&n), sizeof(n)); \
271 for (typename C<S, T, U>::const_iterator it = c.begin(); \
272 it != c.end(); ++it) \
273 WriteType(strm, *it); \
274 return strm; \
275 }
276
277 WRITE_STL_ASSOC_TYPE(set);
278 WRITE_STL_ASSOC_TYPE(unordered_set);
279 WRITE_STL_ASSOC_TYPE(map);
280 WRITE_STL_ASSOC_TYPE(unordered_map);
281
282 // Utilities for converting between int64 or Weight and string.
283
284 int64 StrToInt64(const string &s, const string &src, size_t nline,
285 bool allow_negative, bool *error = 0);
286
287 template <typename Weight>
StrToWeight(const string & s,const string & src,size_t nline)288 Weight StrToWeight(const string &s, const string &src, size_t nline) {
289 Weight w;
290 istringstream strm(s);
291 strm >> w;
292 if (!strm) {
293 FSTERROR() << "StrToWeight: Bad weight = \"" << s
294 << "\", source = " << src << ", line = " << nline;
295 return Weight::NoWeight();
296 }
297 return w;
298 }
299
300 void Int64ToStr(int64 n, string *s);
301
302 template <typename Weight>
WeightToStr(Weight w,string * s)303 void WeightToStr(Weight w, string *s) {
304 ostringstream strm;
305 strm.precision(9);
306 strm << w;
307 s->append(strm.str().data(), strm.str().size());
308 }
309
310 // Utilities for reading/writing integer pairs (typically labels)
311
312 // Modifies line, vec consists of pointers to of buffer beginning
313 // with line.
314 void SplitToVector(char *line, const char *delim,
315 vector<char *> *vec, bool omit_empty_strings);
316
317 // Returns true on success
318 template <typename I>
319 bool ReadIntPairs(const string& filename,
320 vector<pair<I, I> >* pairs,
321 bool allow_negative = false) {
322 ifstream strm(filename.c_str());
323
324 if (!strm) {
325 LOG(ERROR) << "ReadIntPairs: Can't open file: " << filename;
326 return false;
327 }
328
329 const int kLineLen = 8096;
330 char line[kLineLen];
331 size_t nline = 0;
332
333 pairs->clear();
334 while (strm.getline(line, kLineLen)) {
335 ++nline;
336 vector<char *> col;
337 SplitToVector(line, "\n\t ", &col, true);
338 // empty line or comment?
339 if (col.size() == 0 || col[0][0] == '\0' || col[0][0] == '#')
340 continue;
341 if (col.size() != 2) {
342 LOG(ERROR) << "ReadIntPairs: Bad number of columns, "
343 << "file = " << filename << ", line = " << nline;
344 return false;
345 }
346
347 bool err;
348 I i1 = StrToInt64(col[0], filename, nline, allow_negative, &err);
349 if (err) return false;
350 I i2 = StrToInt64(col[1], filename, nline, allow_negative, &err);
351 if (err) return false;
352 pairs->push_back(make_pair(i1, i2));
353 }
354 return true;
355 }
356
357 // Returns true on success
358 template <typename I>
WriteIntPairs(const string & filename,const vector<pair<I,I>> & pairs)359 bool WriteIntPairs(const string& filename,
360 const vector<pair<I, I> >& pairs) {
361 ostream *strm = &cout;
362 if (!filename.empty()) {
363 strm = new ofstream(filename.c_str());
364 if (!*strm) {
365 LOG(ERROR) << "WriteIntPairs: Can't open file: " << filename;
366 return false;
367 }
368 }
369
370 for (ssize_t n = 0; n < pairs.size(); ++n)
371 *strm << pairs[n].first << "\t" << pairs[n].second << "\n";
372
373 if (!*strm) {
374 LOG(ERROR) << "WriteIntPairs: Write failed: "
375 << (filename.empty() ? "standard output" : filename);
376 return false;
377 }
378 if (strm != &cout)
379 delete strm;
380 return true;
381 }
382
383 // Utilities for reading/writing label pairs
384
385 template <typename Label>
386 bool ReadLabelPairs(const string& filename,
387 vector<pair<Label, Label> >* pairs,
388 bool allow_negative = false) {
389 return ReadIntPairs(filename, pairs, allow_negative);
390 }
391
392 template <typename Label>
WriteLabelPairs(const string & filename,vector<pair<Label,Label>> & pairs)393 bool WriteLabelPairs(const string& filename,
394 vector<pair<Label, Label> >& pairs) {
395 return WriteIntPairs(filename, pairs);
396 }
397
398 // Utilities for converting a type name to a legal C symbol.
399
400 void ConvertToLegalCSymbol(string *s);
401
402
403 //
404 // UTILITIES FOR STREAM I/O
405 //
406
407 bool AlignInput(istream &strm);
408 bool AlignOutput(ostream &strm);
409
410 //
411 // UTILITIES FOR PROTOCOL BUFFER I/O
412 //
413
414
415 // An associative container for which testing membership is
416 // faster than an STL set if members are restricted to an interval
417 // that excludes most non-members. A 'Key' must have ==, !=, and < defined.
418 // Element 'NoKey' should be a key that marks an uninitialized key and
419 // is otherwise unused. 'Find()' returns an STL const_iterator to the match
420 // found, otherwise it equals 'End()'.
421 template <class Key, Key NoKey>
422 class CompactSet {
423 public:
424 typedef typename set<Key>::const_iterator const_iterator;
425
CompactSet()426 CompactSet()
427 : min_key_(NoKey),
428 max_key_(NoKey) { }
429
CompactSet(const CompactSet<Key,NoKey> & compact_set)430 CompactSet(const CompactSet<Key, NoKey> &compact_set)
431 : set_(compact_set.set_),
432 min_key_(compact_set.min_key_),
433 max_key_(compact_set.max_key_) { }
434
Insert(Key key)435 void Insert(Key key) {
436 set_.insert(key);
437 if (min_key_ == NoKey || key < min_key_)
438 min_key_ = key;
439 if (max_key_ == NoKey || max_key_ < key)
440 max_key_ = key;
441 }
442
Erase(Key key)443 void Erase(Key key) {
444 set_.erase(key);
445 if (set_.empty()) {
446 min_key_ = max_key_ = NoKey;
447 } else if (key == min_key_) {
448 ++min_key_;
449 } else if (key == max_key_) {
450 --max_key_;
451 }
452 }
453
Clear()454 void Clear() {
455 set_.clear();
456 min_key_ = max_key_ = NoKey;
457 }
458
Find(Key key)459 const_iterator Find(Key key) const {
460 if (min_key_ == NoKey ||
461 key < min_key_ || max_key_ < key)
462 return set_.end();
463 else
464 return set_.find(key);
465 }
466
Member(Key key)467 bool Member(Key key) const {
468 if (min_key_ == NoKey || key < min_key_ || max_key_ < key) {
469 return false; // out of range
470 } else if (min_key_ != NoKey && max_key_ + 1 == min_key_ + set_.size()) {
471 return true; // dense range
472 } else {
473 return set_.find(key) != set_.end();
474 }
475 }
476
Begin()477 const_iterator Begin() const { return set_.begin(); }
478
End()479 const_iterator End() const { return set_.end(); }
480
481 // All stored keys are greater than or equal to this value.
LowerBound()482 Key LowerBound() const { return min_key_; }
483
484 // All stored keys are less than or equal to this value.
UpperBound()485 Key UpperBound() const { return max_key_; }
486
487 private:
488 set<Key> set_;
489 Key min_key_;
490 Key max_key_;
491
492 void operator=(const CompactSet<Key, NoKey> &); //disallow
493 };
494
495 } // namespace fst
496
497 #endif // FST_LIB_UTIL_H__
498