1 #include "util/file_stream.hh"
2 #include "util/file_piece.hh"
3 #include "util/murmur_hash.hh"
4 #include "util/pool.hh"
5 #include "util/string_piece.hh"
6 #include "util/string_piece_hash.hh"
7 #include "util/tokenize_piece.hh"
8 
9 #include <boost/unordered_map.hpp>
10 #include <boost/unordered_set.hpp>
11 
12 #include <cstddef>
13 #include <vector>
14 
15 namespace {
16 
17 struct MutablePiece {
18   mutable StringPiece behind;
operator ==__anon33ef727c0111::MutablePiece19   bool operator==(const MutablePiece &other) const {
20     return behind == other.behind;
21   }
22 };
23 
hash_value(const MutablePiece & m)24 std::size_t hash_value(const MutablePiece &m) {
25   return hash_value(m.behind);
26 }
27 
28 class InternString {
29   public:
Add(StringPiece str)30     const char *Add(StringPiece str) {
31       MutablePiece mut;
32       mut.behind = str;
33       std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut));
34       if (res.second) {
35         void *mem = backing_.Allocate(str.size() + 1);
36         memcpy(mem, str.data(), str.size());
37         static_cast<char*>(mem)[str.size()] = 0;
38         res.first->behind = StringPiece(static_cast<char*>(mem), str.size());
39       }
40       return res.first->behind.data();
41     }
42 
43   private:
44     util::Pool backing_;
45     boost::unordered_set<MutablePiece> strs_;
46 };
47 
48 class TargetWords {
49   public:
Introduce(StringPiece source)50     void Introduce(StringPiece source) {
51       vocab_.resize(vocab_.size() + 1);
52       std::vector<unsigned int> temp(1, vocab_.size() - 1);
53       Add(temp, source);
54     }
55 
Add(const std::vector<unsigned int> & sentences,StringPiece target)56     void Add(const std::vector<unsigned int> &sentences, StringPiece target) {
57       if (sentences.empty()) return;
58       interns_.clear();
59       for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) {
60         interns_.push_back(intern_.Add(*i));
61       }
62       for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) {
63         boost::unordered_set<const char *> &vocab = vocab_[*i];
64         for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) {
65           vocab.insert(*j);
66         }
67       }
68     }
69 
Print() const70     void Print() const {
71       util::FileStream out(1);
72       for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
73         for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
74           out << *j << ' ';
75         }
76         out << '\n';
77       }
78     }
79 
80   private:
81     InternString intern_;
82 
83     std::vector<boost::unordered_set<const char *> > vocab_;
84 
85     // Temporary in Add.
86     std::vector<const char *> interns_;
87 };
88 
89 class Input {
90   public:
Input(std::size_t max_length)91     explicit Input(std::size_t max_length)
92       : max_length_(max_length), sentence_id_(0), empty_() {}
93 
AddSentence(StringPiece sentence,TargetWords & targets)94     void AddSentence(StringPiece sentence, TargetWords &targets) {
95       canonical_.clear();
96       starts_.clear();
97       starts_.push_back(0);
98       for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) {
99         canonical_.append(i->data(), i->size());
100         canonical_ += ' ';
101         starts_.push_back(canonical_.size());
102       }
103       targets.Introduce(canonical_);
104       for (std::size_t i = 0; i < starts_.size() - 1; ++i) {
105         std::size_t subtract = starts_[i];
106         const char *start = &canonical_[subtract];
107         for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) {
108           map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_);
109         }
110       }
111       ++sentence_id_;
112     }
113 
114     // Assumes single space-delimited phrase with no space at the beginning or end.
Matches(StringPiece phrase) const115     const std::vector<unsigned int> &Matches(StringPiece phrase) const {
116       Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size()));
117       return i == map_.end() ? empty_ : i->second;
118     }
119 
120   private:
121     const std::size_t max_length_;
122 
123     // hash of phrase is the key, array of sentences is the value.
124     typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map;
125     Map map_;
126 
127     std::size_t sentence_id_;
128 
129     // Temporaries in AddSentence.
130     std::string canonical_;
131     std::vector<std::size_t> starts_;
132 
133     const std::vector<unsigned int> empty_;
134 };
135 
136 } // namespace
137 
main(int argc,char * argv[])138 int main(int argc, char *argv[]) {
139   if (argc != 2) {
140     std::cerr << "Expected source text on the command line" << std::endl;
141     return 1;
142   }
143   Input input(7);
144   TargetWords targets;
145   try {
146     util::FilePiece inputs(argv[1], &std::cerr);
147     while (true)
148       input.AddSentence(inputs.ReadLine(), targets);
149   } catch (const util::EndOfFileException &e) {}
150 
151   util::FilePiece table(0, NULL, &std::cerr);
152   StringPiece line;
153   const StringPiece pipes("|||");
154   while (true) {
155     try {
156       line = table.ReadLine();
157     } catch (const util::EndOfFileException &e) { break; }
158     util::TokenIter<util::MultiCharacter> it(line, pipes);
159     StringPiece source(*it);
160     if (!source.empty() && source[source.size() - 1] == ' ')
161       source.remove_suffix(1);
162     targets.Add(input.Matches(source), *++it);
163   }
164   targets.Print();
165 }
166