1 /* Copyright (C) 2011 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 #ifndef _TERMPROC_H_INCLUDED_
18 #define _TERMPROC_H_INCLUDED_
19 
20 #include <vector>
21 #include <string>
22 #include <set>
23 #include <list>
24 
25 #include "textsplit.h"
26 #include "stoplist.h"
27 #include "smallut.h"
28 #include "utf8iter.h"
29 #include "unacpp.h"
30 #include "syngroups.h"
31 
32 namespace Rcl {
33 
34 /**
35  * Termproc objects take term tokens as input and do something
36  * with them: transform to lowercase, filter out stop words, generate n-grams,
37  * finally index or generate search clauses, etc. They are chained and can
38  * be arranged to form different pipelines depending on the desired processing
39  * steps: for example, optional stoplist or commongram processing.
40  *
41  * Shared processing steps are defined in this file. The first and last steps
42  * are usually defined in the specific module.
43  * - The front TermProc is typically chained from a TextSplit object
44  *   which generates the original terms, and calls takeword() from its
45  *   own takeword() method.
46  * - The last TermProc does something with the finalized terms, e.g. adds
47  *   them to the index.
48  */
49 
/**
 * Base class of the term processing pipeline. It only takes care of
 * chaining: each entry point forwards its arguments to the next element
 * when there is one. Derived classes do their local processing, then call
 * the base takeword(), newpage() and flush() methods to ensure that terms go
 * through the pipe.
 */
class TermProc {
public:
    /**
     * @param next the element which will receive what goes through this
     *   one, or a null pointer if this is the end of the chain. The pointer
     *   is stored as is, nothing in this class deletes it.
     */
    explicit TermProc(TermProc *next) : m_next(next) {}
    virtual ~TermProc() = default;

    /* Copy construction and assignment are forbidden */
    TermProc(const TermProc&) = delete;
    TermProc& operator=(const TermProc&) = delete;

    /**
     * Pass a term to the next element.
     * @param term the term text.
     * @param pos the term position.
     * @param bs,be presumably the byte offsets of the term start and end in
     *   the input text (to be confirmed against the TextSplit documentation).
     * @return what the next element returned, or true at the end of the
     *   chain.
     */
    virtual bool takeword(const std::string& term, int pos, int bs, int be) {
        return m_next ? m_next->takeword(term, pos, bs, be) : true;
    }

    /** newpage() is like takeword(), but for page breaks. */
    virtual void newpage(int pos) {
        if (m_next) {
            m_next->newpage(pos);
        }
    }

    /**
     * Pass a flush request to the next element.
     * @return what the next element returned, or true at the end of the
     *   chain.
     */
    virtual bool flush() {
        return m_next ? m_next->flush() : true;
    }

private:
    // Next element in the chain, may be null. Not owned.
    TermProc *m_next;
};
79 
80 /**
81  * Helper specialized TextSplit class, feeds the pipeline:
82  * - The takeword() method calls a TermProc->takeword().
83  * - The text_to_words() method also takes care of flushing.
84  * Both methods can be further specialized by the user (they should then call
85  * the base methods when they've done the local processing).
86  */
87 class TextSplitP : public TextSplit {
88 public:
89     TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
TextSplit(flags)90         : TextSplit(flags), m_prc(prc)  {}
91 
text_to_words(const string & in)92     virtual bool text_to_words(const string &in) {
93         bool ret = TextSplit::text_to_words(in);
94         if (m_prc && !m_prc->flush())
95             return false;
96         return ret;
97     }
98 
takeword(const string & term,int pos,int bs,int be)99     virtual bool takeword(const string& term, int pos, int bs, int be) {
100         if (m_prc)
101             return m_prc->takeword(term, pos, bs, be);
102         return true;
103     }
104 
newpage(int pos)105     virtual void newpage(int pos) {
106         if (m_prc)
107             return m_prc->newpage(pos);
108     }
109 
110 private:
111     TermProc *m_prc;
112 };
113 
114 /** Unaccent and lowercase term. If the index is
115  *  not case/diac-sensitive, this is usually the first step in the pipeline
116  */
117 class TermProcPrep : public TermProc {
118 public:
TermProcPrep(TermProc * nxt)119     TermProcPrep(TermProc *nxt)
120         : TermProc(nxt) {}
121 
takeword(const string & itrm,int pos,int bs,int be)122     virtual bool takeword(const string& itrm, int pos, int bs, int be) {
123         m_totalterms++;
124         string otrm;
125 
126         if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
127             LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n");
128             m_unacerrors++;
129             // We don't generate a fatal error because of a bad term,
130             // but one has to put the limit somewhere
131             if (m_unacerrors > 500 &&
132                 (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
133                 // More than 1 error for every other term
134                 LOGERR("splitter::takeword: too many unac errors " <<
135                        m_unacerrors << "/"  << m_totalterms << "\n");
136                 return false;
137             }
138             return true;
139         }
140 
141         if (otrm.empty()) {
142             // It may happen in some weird cases that the output from
143             // unac is empty (if the word actually consisted entirely
144             // of diacritics ...)  The consequence is that a phrase
145             // search won't work without additional slack.
146             return true;
147         }
148 
149         // We should have a Japanese stemmer to handle this, but for
150         // experimenting, let's do it here: remove 'prolounged sound
151         // mark' and its halfwidth variant from the end of terms.
152         if ((unsigned int)otrm[0] > 127) {
153             Utf8Iter it(otrm);
154             if (TextSplit::isKATAKANA(*it)) {
155                 Utf8Iter itprev = it;
156                 while (*it != (unsigned int)-1) {
157                     itprev = it;
158                     it++;
159                 }
160                 if (*itprev == 0x30fc || *itprev == 0xff70) {
161                     otrm = otrm.substr(0, itprev.getBpos());
162                 }
163             }
164         }
165         if (otrm.empty()) {
166             return true;
167         }
168 
169         // It may also occur that unac introduces spaces in the string
170         // (when removing isolated accents, may happen for Greek
171         // for example). This is a pathological situation. We
172         // index all the resulting terms at the same pos because
173         // the surrounding code is not designed to handle a pos
174         // change in here. This means that phrase searches and
175         // snippets will be wrong, but at least searching for the
176         // terms will work.
177         bool hasspace = otrm.find(' ') != std::string::npos;
178         if (hasspace) {
179             std::vector<std::string> terms;
180             stringToTokens(otrm, terms, " ", true);
181             for (const auto& term : terms) {
182                 if (!TermProc::takeword(term, pos, bs, be)) {
183                     return false;
184                 }
185             }
186             return true;
187         }
188         return TermProc::takeword(otrm, pos, bs, be);
189     }
190 
flush()191     virtual bool flush() {
192         m_totalterms = m_unacerrors = 0;
193         return TermProc::flush();
194     }
195 
196 private:
197     int m_totalterms{0};
198     int m_unacerrors{0};
199 };
200 
201 /** Compare to stop words list and discard if match found */
202 class TermProcStop : public TermProc {
203 public:
TermProcStop(TermProc * nxt,const Rcl::StopList & stops)204     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
205         : TermProc(nxt), m_stops(stops) {}
206 
takeword(const string & term,int pos,int bs,int be)207     virtual bool takeword(const string& term, int pos, int bs, int be) {
208         if (m_stops.isStop(term)) {
209             return true;
210         }
211         return TermProc::takeword(term, pos, bs, be);
212     }
213 
214 private:
215     const Rcl::StopList& m_stops;
216 };
217 
218 /** Generate multiword terms for multiword synonyms. This allows
219  * NEAR/PHRASE searches for multiword synonyms. */
220 class TermProcMulti : public TermProc {
221 public:
TermProcMulti(TermProc * nxt,const SynGroups & sg)222     TermProcMulti(TermProc *nxt, const SynGroups& sg)
223         : TermProc(nxt), m_groups(sg.getmultiwords()),
224           m_maxl(sg.getmultiwordsmaxlength()) {}
225 
takeword(const string & term,int pos,int bs,int be)226     virtual bool takeword(const string& term, int pos, int bs, int be) {
227         LOGDEB1("TermProcMulti::takeword[" << term << "] at pos " << pos <<"\n");
228         if (m_maxl < 2) {
229             // Should not have been pushed??
230             return TermProc::takeword(term, pos, bs, be);
231         }
232         m_terms.push_back(term);
233         if (m_terms.size() > m_maxl) {
234             m_terms.pop_front();
235         }
236         string comp;
237         int gsz{1};
238         for (const auto& gterm : m_terms) {
239             if (comp.empty()) {
240                 comp = gterm;
241                 continue;
242             } else {
243                 comp += " ";
244                 comp += gterm;
245                 gsz++;
246                 // We could optimize by not testing m_groups for sizes
247                 // which do not exist.
248                 // if not gsz in sizes continue;
249             }
250             if (m_groups.find(comp) != m_groups.end()) {
251                 LOGDEB1("Emitting multiword synonym: [" << comp << "] at pos " <<
252                        pos-gsz+1 << "\n");
253                 // TBD bs-be correct computation. Need to store the
254                 // values in a parallel list
255                 TermProc::takeword(comp, pos-gsz+1, bs-comp.size(), be);
256             }
257         }
258         return TermProc::takeword(term, pos, bs, be);
259     }
260 
261 private:
262     const std::set<std::string>& m_groups;
263     size_t m_maxl{0};
264     std::list<std::string> m_terms;
265 };
266 
267 /** Handle common-gram generation: combine frequent terms with neighbours to
268  *  shorten the positions lists for phrase searches.
269  *  NOTE: This does not currently work because of bad interaction with the
270  *  spans (ie john@domain.com) generation in textsplit. Not used, kept for
271  *  testing only
272  */
273 class TermProcCommongrams : public TermProc {
274 public:
TermProcCommongrams(TermProc * nxt,const Rcl::StopList & stops)275     TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
276         : TermProc(nxt), m_stops(stops), m_onlygrams(false) {}
277 
takeword(const string & term,int pos,int bs,int be)278     virtual bool takeword(const string& term, int pos, int bs, int be) {
279         LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " <<
280                 be << " [" << term << "]\n");
281         bool isstop = m_stops.isStop(term);
282         bool twogramemit = false;
283 
284         if (!m_prevterm.empty() && (m_prevstop || isstop)) {
285             // create 2-gram. space unnecessary but improves
286             // the readability of queries
287             string twogram;
288             twogram.swap(m_prevterm);
289             twogram.append(1, ' ');
290             twogram += term;
291             // When emitting a complex term we set the bps to 0. This may
292             // be used by our clients
293             if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
294                 return false;
295             twogramemit = true;
296 #if 0
297             if (m_stops.isStop(twogram)) {
298                 firstword = twogram;
299                 isstop = false;
300             }
301 #endif
302         }
303 
304         m_prevterm = term;
305         m_prevstop = isstop;
306         m_prevpos = pos;
307         m_prevsent = false;
308         m_prevbs = bs;
309         m_prevbe = be;
310         // If flags allow, emit the bare term at the current pos.
311         if (!m_onlygrams || (!isstop && !twogramemit)) {
312             if (!TermProc::takeword(term, pos, bs, be))
313                 return false;
314             m_prevsent = true;
315         }
316 
317         return true;
318     }
319 
flush()320     virtual bool flush() {
321         if (!m_prevsent && !m_prevterm.empty())
322             if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
323                 return false;
324 
325         m_prevterm.clear();
326         m_prevsent = true;
327         return TermProc::flush();
328     }
onlygrams(bool on)329     void onlygrams(bool on) {
330         m_onlygrams = on;
331     }
332 private:
333     // The stoplist we're using
334     const Rcl::StopList& m_stops;
335     // Remembered data for the last processed term
336     string m_prevterm;
337     bool   m_prevstop;
338     int    m_prevpos;
339     int    m_prevbs;
340     int    m_prevbe;
341     bool   m_prevsent;
342     // If this is set, we only emit longest grams
343     bool   m_onlygrams;
344 };
345 
346 
347 } // End namespace Rcl
348 
349 #endif /* _TERMPROC_H_INCLUDED_ */
350 
351