1 //
2 // Copyright RIME Developers
3 // Distributed under the BSD License
4 //
5 // simplistic sentence-making
6 //
7 // 2011-10-06 GONG Chen <chen.sst@gmail.com>
8 //
9 #include <algorithm>
10 #include <functional>
11 #include <rime/candidate.h>
12 #include <rime/config.h>
13 #include <rime/dict/vocabulary.h>
14 #include <rime/gear/grammar.h>
15 #include <rime/gear/poet.h>
16 
17 namespace rime {
18 
19 // internal data structure used during the sentence making process.
20 // the output line of the algorithm is transformed to an<Sentence>.
21 struct Line {
22   // be sure the pointer to predecessor Line object is stable. it works since
23   // pointer to values stored in std::map and std::unordered_map are stable.
24   const Line* predecessor;
25   // as long as the word graph lives, pointers to entries are valid.
26   const DictEntry* entry;
27   size_t end_pos;
28   double weight;
29 
30   static const Line kEmpty;
31 
emptyrime::Line32   bool empty() const {
33     return !predecessor && !entry;
34   }
35 
last_wordrime::Line36   string last_word() const {
37     return entry ? entry->text : string();
38   }
39 
40   struct Components {
41     vector<const Line*> lines;
42 
Componentsrime::Line::Components43     Components(const Line* line) {
44       for (const Line* cursor = line;
45            !cursor->empty();
46            cursor = cursor->predecessor) {
47           lines.push_back(cursor);
48       }
49     }
50 
beginrime::Line::Components51     decltype(lines.crbegin()) begin() const { return lines.crbegin(); }
endrime::Line::Components52     decltype(lines.crend()) end() const { return lines.crend(); }
53   };
54 
componentsrime::Line55   Components components() const { return Components(this); }
56 
contextrime::Line57   string context() const {
58     // look back 2 words
59     return empty() ? string() :
60         !predecessor || predecessor->empty() ? last_word() :
61         predecessor->last_word() + last_word();
62   }
63 
word_lengthsrime::Line64   vector<size_t> word_lengths() const {
65     vector<size_t> lengths;
66     size_t last_end_pos = 0;
67     for (const auto* c : components()) {
68       lengths.push_back(c->end_pos - last_end_pos);
69       last_end_pos = c->end_pos;
70     }
71     return lengths;
72   }
73 };
74 
75 const Line Line::kEmpty{nullptr, nullptr, 0, 0.0};
76 
create_grammar(Config * config)77 inline static Grammar* create_grammar(Config* config) {
78   if (auto* grammar = Grammar::Require("grammar")) {
79     return grammar->Create(config);
80   }
81   return nullptr;
82 }
83 
Poet(const Language * language,Config * config,Compare compare)84 Poet::Poet(const Language* language, Config* config, Compare compare)
85     : language_(language),
86       grammar_(create_grammar(config)),
87       compare_(compare) {}
88 
~Poet()89 Poet::~Poet() {}
90 
CompareWeight(const Line & one,const Line & other)91 bool Poet::CompareWeight(const Line& one, const Line& other) {
92   return one.weight < other.weight;
93 }
94 
95 // returns true if one is less than other.
LeftAssociateCompare(const Line & one,const Line & other)96 bool Poet::LeftAssociateCompare(const Line& one, const Line& other) {
97   if (one.weight < other.weight) return true;
98   if (one.weight == other.weight) {
99     auto one_word_lens = one.word_lengths();
100     auto other_word_lens = other.word_lengths();
101     // less words is more favorable
102     if (one_word_lens.size() > other_word_lens.size()) return true;
103     if (one_word_lens.size() == other_word_lens.size()) {
104       return std::lexicographical_compare(
105           one_word_lens.begin(), one_word_lens.end(),
106           other_word_lens.begin(), other_word_lens.end());
107     }
108   }
109   return false;
110 }
111 
112 // keep the best line candidate per last phrase
113 using LineCandidates = hash_map<string, Line>;
114 
115 template <int N>
find_top_candidates(const LineCandidates & candidates,Poet::Compare compare)116 static vector<const Line*> find_top_candidates(
117     const LineCandidates& candidates, Poet::Compare compare) {
118   vector<const Line*> top;
119   top.reserve(N + 1);
120   for (const auto& candidate : candidates) {
121     auto pos = std::upper_bound(
122         top.begin(), top.end(), &candidate.second,
123         [&](const Line* a, const Line* b) { return compare(*b, *a); }); // desc
124     if (pos - top.begin() >= N) continue;
125     top.insert(pos, &candidate.second);
126     if (top.size() > N) top.pop_back();
127   }
128   return top;
129 }
130 
131 using UpdateLineCandidate = function<void (const Line& candidate)>;
132 
133 struct BeamSearch {
134   using State = LineCandidates;
135 
136   static constexpr int kMaxLineCandidates = 7;
137 
Initiaterime::BeamSearch138   static void Initiate(State& initial_state) {
139     initial_state.emplace("", Line::kEmpty);
140   }
141 
ForEachCandidaterime::BeamSearch142   static void ForEachCandidate(const State& state,
143                                Poet::Compare compare,
144                                UpdateLineCandidate update) {
145     auto top_candidates =
146         find_top_candidates<kMaxLineCandidates>(state, compare);
147     for (const auto* candidate : top_candidates) {
148       update(*candidate);
149     }
150   }
151 
BestLineToUpdaterime::BeamSearch152   static Line& BestLineToUpdate(State& state, const Line& new_line) {
153     const auto& key = new_line.last_word();
154     return state[key];
155   }
156 
BestLineInStaterime::BeamSearch157   static const Line& BestLineInState(const State& final_state,
158                                      Poet::Compare compare) {
159     const Line* best = nullptr;
160     for (const auto& candidate : final_state) {
161       if (!best || compare(*best, candidate.second)) {
162         best = &candidate.second;
163       }
164     }
165     return best ? *best : Line::kEmpty;
166   }
167 };
168 
169 struct DynamicProgramming {
170   using State = Line;
171 
Initiaterime::DynamicProgramming172   static void Initiate(State& initial_state) {
173     initial_state = Line::kEmpty;
174   }
175 
ForEachCandidaterime::DynamicProgramming176   static void ForEachCandidate(const State& state,
177                                Poet::Compare compare,
178                                UpdateLineCandidate update) {
179     update(state);
180   }
181 
BestLineToUpdaterime::DynamicProgramming182   static Line& BestLineToUpdate(State& state, const Line& new_line) {
183     return state;
184   }
185 
BestLineInStaterime::DynamicProgramming186   static const Line& BestLineInState(const State& final_state,
187                                      Poet::Compare compare) {
188     return final_state;
189   }
190 };
191 
192 template <class Strategy>
MakeSentenceWithStrategy(const WordGraph & graph,size_t total_length,const string & preceding_text)193 an<Sentence> Poet::MakeSentenceWithStrategy(const WordGraph& graph,
194                                             size_t total_length,
195                                             const string& preceding_text) {
196   map<int, typename Strategy::State> states;
197   Strategy::Initiate(states[0]);
198   for (const auto& sv : graph) {
199     size_t start_pos = sv.first;
200     if (states.find(start_pos) == states.end())
201       continue;
202     DLOG(INFO) << "start pos: " << start_pos;
203     const auto& source_state = states[start_pos];
204     const auto update =
205         [this, &states, &sv, start_pos, total_length, &preceding_text]
206         (const Line& candidate) {
207           for (const auto& ev : sv.second) {
208             size_t end_pos = ev.first;
209             if (start_pos == 0 && end_pos == total_length)
210               continue;  // exclude single word from the result
211             DLOG(INFO) << "end pos: " << end_pos;
212             bool is_rear = end_pos == total_length;
213             auto& target_state = states[end_pos];
214             // extend candidates with dict entries on a valid edge.
215             const DictEntryList& entries = ev.second;
216             for (const auto& entry : entries) {
217               const string& context =
218                   candidate.empty() ? preceding_text : candidate.context();
219               double weight = candidate.weight +
220                               Grammar::Evaluate(context,
221                                                 entry->text,
222                                                 entry->weight,
223                                                 is_rear,
224                                                 grammar_.get());
225               Line new_line{&candidate, entry.get(), end_pos, weight};
226               Line& best = Strategy::BestLineToUpdate(target_state, new_line);
227               if (best.empty() || compare_(best, new_line)) {
228                 DLOG(INFO) << "updated line ending at " << end_pos
229                            << " with text: ..." << new_line.last_word()
230                            << " weight: " << new_line.weight;
231                 best = new_line;
232               }
233             }
234           }
235         };
236     Strategy::ForEachCandidate(source_state, compare_, update);
237   }
238   auto found = states.find(total_length);
239   if (found == states.end() || found->second.empty())
240     return nullptr;
241   const Line& best = Strategy::BestLineInState(found->second, compare_);
242   auto sentence = New<Sentence>(language_);
243   for (const auto* c : best.components()) {
244     if (!c->entry) continue;
245     sentence->Extend(*c->entry, c->end_pos, c->weight);
246   }
247   return sentence;
248 }
249 
MakeSentence(const WordGraph & graph,size_t total_length,const string & preceding_text)250 an<Sentence> Poet::MakeSentence(const WordGraph& graph,
251                                 size_t total_length,
252                                 const string& preceding_text) {
253   return grammar_ ?
254       MakeSentenceWithStrategy<BeamSearch>(
255           graph, total_length, preceding_text) :
256       MakeSentenceWithStrategy<DynamicProgramming>(
257           graph, total_length, preceding_text);
258 }
259 
260 }  // namespace rime
261