1 #include "util/file_stream.hh"
2 #include "util/file_piece.hh"
3 #include "util/murmur_hash.hh"
4 #include "util/pool.hh"
5 #include "util/string_piece.hh"
6 #include "util/string_piece_hash.hh"
7 #include "util/tokenize_piece.hh"
8
9 #include <boost/unordered_map.hpp>
10 #include <boost/unordered_set.hpp>
11
12 #include <cstddef>
13 #include <vector>
14
15 namespace {
16
17 struct MutablePiece {
18 mutable StringPiece behind;
operator ==__anon33ef727c0111::MutablePiece19 bool operator==(const MutablePiece &other) const {
20 return behind == other.behind;
21 }
22 };
23
hash_value(const MutablePiece & m)24 std::size_t hash_value(const MutablePiece &m) {
25 return hash_value(m.behind);
26 }
27
28 class InternString {
29 public:
Add(StringPiece str)30 const char *Add(StringPiece str) {
31 MutablePiece mut;
32 mut.behind = str;
33 std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut));
34 if (res.second) {
35 void *mem = backing_.Allocate(str.size() + 1);
36 memcpy(mem, str.data(), str.size());
37 static_cast<char*>(mem)[str.size()] = 0;
38 res.first->behind = StringPiece(static_cast<char*>(mem), str.size());
39 }
40 return res.first->behind.data();
41 }
42
43 private:
44 util::Pool backing_;
45 boost::unordered_set<MutablePiece> strs_;
46 };
47
48 class TargetWords {
49 public:
Introduce(StringPiece source)50 void Introduce(StringPiece source) {
51 vocab_.resize(vocab_.size() + 1);
52 std::vector<unsigned int> temp(1, vocab_.size() - 1);
53 Add(temp, source);
54 }
55
Add(const std::vector<unsigned int> & sentences,StringPiece target)56 void Add(const std::vector<unsigned int> &sentences, StringPiece target) {
57 if (sentences.empty()) return;
58 interns_.clear();
59 for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) {
60 interns_.push_back(intern_.Add(*i));
61 }
62 for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) {
63 boost::unordered_set<const char *> &vocab = vocab_[*i];
64 for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) {
65 vocab.insert(*j);
66 }
67 }
68 }
69
Print() const70 void Print() const {
71 util::FileStream out(1);
72 for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) {
73 for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) {
74 out << *j << ' ';
75 }
76 out << '\n';
77 }
78 }
79
80 private:
81 InternString intern_;
82
83 std::vector<boost::unordered_set<const char *> > vocab_;
84
85 // Temporary in Add.
86 std::vector<const char *> interns_;
87 };
88
89 class Input {
90 public:
Input(std::size_t max_length)91 explicit Input(std::size_t max_length)
92 : max_length_(max_length), sentence_id_(0), empty_() {}
93
AddSentence(StringPiece sentence,TargetWords & targets)94 void AddSentence(StringPiece sentence, TargetWords &targets) {
95 canonical_.clear();
96 starts_.clear();
97 starts_.push_back(0);
98 for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) {
99 canonical_.append(i->data(), i->size());
100 canonical_ += ' ';
101 starts_.push_back(canonical_.size());
102 }
103 targets.Introduce(canonical_);
104 for (std::size_t i = 0; i < starts_.size() - 1; ++i) {
105 std::size_t subtract = starts_[i];
106 const char *start = &canonical_[subtract];
107 for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) {
108 map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_);
109 }
110 }
111 ++sentence_id_;
112 }
113
114 // Assumes single space-delimited phrase with no space at the beginning or end.
Matches(StringPiece phrase) const115 const std::vector<unsigned int> &Matches(StringPiece phrase) const {
116 Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size()));
117 return i == map_.end() ? empty_ : i->second;
118 }
119
120 private:
121 const std::size_t max_length_;
122
123 // hash of phrase is the key, array of sentences is the value.
124 typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map;
125 Map map_;
126
127 std::size_t sentence_id_;
128
129 // Temporaries in AddSentence.
130 std::string canonical_;
131 std::vector<std::size_t> starts_;
132
133 const std::vector<unsigned int> empty_;
134 };
135
136 } // namespace
137
main(int argc,char * argv[])138 int main(int argc, char *argv[]) {
139 if (argc != 2) {
140 std::cerr << "Expected source text on the command line" << std::endl;
141 return 1;
142 }
143 Input input(7);
144 TargetWords targets;
145 try {
146 util::FilePiece inputs(argv[1], &std::cerr);
147 while (true)
148 input.AddSentence(inputs.ReadLine(), targets);
149 } catch (const util::EndOfFileException &e) {}
150
151 util::FilePiece table(0, NULL, &std::cerr);
152 StringPiece line;
153 const StringPiece pipes("|||");
154 while (true) {
155 try {
156 line = table.ReadLine();
157 } catch (const util::EndOfFileException &e) { break; }
158 util::TokenIter<util::MultiCharacter> it(line, pipes);
159 StringPiece source(*it);
160 if (!source.empty() && source[source.size() - 1] == ' ')
161 source.remove_suffix(1);
162 targets.Add(input.Matches(source), *++it);
163 }
164 targets.Print();
165 }
166