1 /* Copyright (C) 2011 J.F.Dockes 2 * This program is free software; you can redistribute it and/or modify 3 * it under the terms of the GNU General Public License as published by 4 * the Free Software Foundation; either version 2 of the License, or 5 * (at your option) any later version. 6 * 7 * This program is distributed in the hope that it will be useful, 8 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 * GNU General Public License for more details. 11 * 12 * You should have received a copy of the GNU General Public License 13 * along with this program; if not, write to the 14 * Free Software Foundation, Inc., 15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 16 */ 17 #ifndef _TERMPROC_H_INCLUDED_ 18 #define _TERMPROC_H_INCLUDED_ 19 20 #include <vector> 21 #include <string> 22 #include <set> 23 #include <list> 24 25 #include "textsplit.h" 26 #include "stoplist.h" 27 #include "smallut.h" 28 #include "utf8iter.h" 29 #include "unacpp.h" 30 #include "syngroups.h" 31 32 namespace Rcl { 33 34 /** 35 * Termproc objects take term tokens as input and do something 36 * with them: transform to lowercase, filter out stop words, generate n-grams, 37 * finally index or generate search clauses, etc. They are chained and can 38 * be arranged to form different pipelines depending on the desired processing 39 * steps: for example, optional stoplist or commongram processing. 40 * 41 * Shared processing steps are defined in this file. The first and last steps 42 * are usually defined in the specific module. 43 * - The front TermProc is typically chained from a TextSplit object 44 * which generates the original terms, and calls takeword() from its 45 * own takeword() method. 46 * - The last TermProc does something with the finalized terms, e.g. adds 47 * them to the index. 48 */ 49 50 /** 51 * The base class takes care of chaining: all derived classes call its 52 * takeword() and flush() methods to ensure that terms go through the pipe. 53 */ 54 class TermProc { 55 public: TermProc(TermProc * next)56 TermProc(TermProc* next) : m_next(next) {} ~TermProc()57 virtual ~TermProc() {} 58 /* Copyconst and assignment forbidden */ 59 TermProc(const TermProc &) = delete; 60 TermProc& operator=(const TermProc &) = delete; takeword(const string & term,int pos,int bs,int be)61 virtual bool takeword(const string &term, int pos, int bs, int be) { 62 if (m_next) 63 return m_next->takeword(term, pos, bs, be); 64 return true; 65 } 66 // newpage() is like takeword(), but for page breaks. newpage(int pos)67 virtual void newpage(int pos) { 68 if (m_next) 69 m_next->newpage(pos); 70 } flush()71 virtual bool flush() { 72 if (m_next) 73 return m_next->flush(); 74 return true; 75 } 76 private: 77 TermProc *m_next; 78 }; 79 80 /** 81 * Helper specialized TextSplit class, feeds the pipeline: 82 * - The takeword() method calls a TermProc->takeword(). 83 * - The text_to_words() method also takes care of flushing. 84 * Both methods can be further specialized by the user (they should then call 85 * the base methods when they've done the local processing). 86 */ 87 class TextSplitP : public TextSplit { 88 public: 89 TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE)) TextSplit(flags)90 : TextSplit(flags), m_prc(prc) {} 91 text_to_words(const string & in)92 virtual bool text_to_words(const string &in) { 93 bool ret = TextSplit::text_to_words(in); 94 if (m_prc && !m_prc->flush()) 95 return false; 96 return ret; 97 } 98 takeword(const string & term,int pos,int bs,int be)99 virtual bool takeword(const string& term, int pos, int bs, int be) { 100 if (m_prc) 101 return m_prc->takeword(term, pos, bs, be); 102 return true; 103 } 104 newpage(int pos)105 virtual void newpage(int pos) { 106 if (m_prc) 107 return m_prc->newpage(pos); 108 } 109 110 private: 111 TermProc *m_prc; 112 }; 113 114 /** Unaccent and lowercase term. If the index is 115 * not case/diac-sensitive, this is usually the first step in the pipeline 116 */ 117 class TermProcPrep : public TermProc { 118 public: TermProcPrep(TermProc * nxt)119 TermProcPrep(TermProc *nxt) 120 : TermProc(nxt) {} 121 takeword(const string & itrm,int pos,int bs,int be)122 virtual bool takeword(const string& itrm, int pos, int bs, int be) { 123 m_totalterms++; 124 string otrm; 125 126 if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) { 127 LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n"); 128 m_unacerrors++; 129 // We don't generate a fatal error because of a bad term, 130 // but one has to put the limit somewhere 131 if (m_unacerrors > 500 && 132 (double(m_totalterms) / double(m_unacerrors)) < 2.0) { 133 // More than 1 error for every other term 134 LOGERR("splitter::takeword: too many unac errors " << 135 m_unacerrors << "/" << m_totalterms << "\n"); 136 return false; 137 } 138 return true; 139 } 140 141 if (otrm.empty()) { 142 // It may happen in some weird cases that the output from 143 // unac is empty (if the word actually consisted entirely 144 // of diacritics ...) The consequence is that a phrase 145 // search won't work without additional slack. 146 return true; 147 } 148 149 // We should have a Japanese stemmer to handle this, but for 150 // experimenting, let's do it here: remove 'prolounged sound 151 // mark' and its halfwidth variant from the end of terms. 152 if ((unsigned int)otrm[0] > 127) { 153 Utf8Iter it(otrm); 154 if (TextSplit::isKATAKANA(*it)) { 155 Utf8Iter itprev = it; 156 while (*it != (unsigned int)-1) { 157 itprev = it; 158 it++; 159 } 160 if (*itprev == 0x30fc || *itprev == 0xff70) { 161 otrm = otrm.substr(0, itprev.getBpos()); 162 } 163 } 164 } 165 if (otrm.empty()) { 166 return true; 167 } 168 169 // It may also occur that unac introduces spaces in the string 170 // (when removing isolated accents, may happen for Greek 171 // for example). This is a pathological situation. We 172 // index all the resulting terms at the same pos because 173 // the surrounding code is not designed to handle a pos 174 // change in here. This means that phrase searches and 175 // snippets will be wrong, but at least searching for the 176 // terms will work. 177 bool hasspace = otrm.find(' ') != std::string::npos; 178 if (hasspace) { 179 std::vector<std::string> terms; 180 stringToTokens(otrm, terms, " ", true); 181 for (const auto& term : terms) { 182 if (!TermProc::takeword(term, pos, bs, be)) { 183 return false; 184 } 185 } 186 return true; 187 } 188 return TermProc::takeword(otrm, pos, bs, be); 189 } 190 flush()191 virtual bool flush() { 192 m_totalterms = m_unacerrors = 0; 193 return TermProc::flush(); 194 } 195 196 private: 197 int m_totalterms{0}; 198 int m_unacerrors{0}; 199 }; 200 201 /** Compare to stop words list and discard if match found */ 202 class TermProcStop : public TermProc { 203 public: TermProcStop(TermProc * nxt,const Rcl::StopList & stops)204 TermProcStop(TermProc *nxt, const Rcl::StopList& stops) 205 : TermProc(nxt), m_stops(stops) {} 206 takeword(const string & term,int pos,int bs,int be)207 virtual bool takeword(const string& term, int pos, int bs, int be) { 208 if (m_stops.isStop(term)) { 209 return true; 210 } 211 return TermProc::takeword(term, pos, bs, be); 212 } 213 214 private: 215 const Rcl::StopList& m_stops; 216 }; 217 218 /** Generate multiword terms for multiword synonyms. This allows 219 * NEAR/PHRASE searches for multiword synonyms. */ 220 class TermProcMulti : public TermProc { 221 public: TermProcMulti(TermProc * nxt,const SynGroups & sg)222 TermProcMulti(TermProc *nxt, const SynGroups& sg) 223 : TermProc(nxt), m_groups(sg.getmultiwords()), 224 m_maxl(sg.getmultiwordsmaxlength()) {} 225 takeword(const string & term,int pos,int bs,int be)226 virtual bool takeword(const string& term, int pos, int bs, int be) { 227 LOGDEB1("TermProcMulti::takeword[" << term << "] at pos " << pos <<"\n"); 228 if (m_maxl < 2) { 229 // Should not have been pushed?? 230 return TermProc::takeword(term, pos, bs, be); 231 } 232 m_terms.push_back(term); 233 if (m_terms.size() > m_maxl) { 234 m_terms.pop_front(); 235 } 236 string comp; 237 int gsz{1}; 238 for (const auto& gterm : m_terms) { 239 if (comp.empty()) { 240 comp = gterm; 241 continue; 242 } else { 243 comp += " "; 244 comp += gterm; 245 gsz++; 246 // We could optimize by not testing m_groups for sizes 247 // which do not exist. 248 // if not gsz in sizes continue; 249 } 250 if (m_groups.find(comp) != m_groups.end()) { 251 LOGDEB1("Emitting multiword synonym: [" << comp << "] at pos " << 252 pos-gsz+1 << "\n"); 253 // TBD bs-be correct computation. Need to store the 254 // values in a parallel list 255 TermProc::takeword(comp, pos-gsz+1, bs-comp.size(), be); 256 } 257 } 258 return TermProc::takeword(term, pos, bs, be); 259 } 260 261 private: 262 const std::set<std::string>& m_groups; 263 size_t m_maxl{0}; 264 std::list<std::string> m_terms; 265 }; 266 267 /** Handle common-gram generation: combine frequent terms with neighbours to 268 * shorten the positions lists for phrase searches. 269 * NOTE: This does not currently work because of bad interaction with the 270 * spans (ie john@domain.com) generation in textsplit. Not used, kept for 271 * testing only 272 */ 273 class TermProcCommongrams : public TermProc { 274 public: TermProcCommongrams(TermProc * nxt,const Rcl::StopList & stops)275 TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops) 276 : TermProc(nxt), m_stops(stops), m_onlygrams(false) {} 277 takeword(const string & term,int pos,int bs,int be)278 virtual bool takeword(const string& term, int pos, int bs, int be) { 279 LOGDEB1("TermProcCom::takeword: pos " << pos << " " << bs << " " << 280 be << " [" << term << "]\n"); 281 bool isstop = m_stops.isStop(term); 282 bool twogramemit = false; 283 284 if (!m_prevterm.empty() && (m_prevstop || isstop)) { 285 // create 2-gram. space unnecessary but improves 286 // the readability of queries 287 string twogram; 288 twogram.swap(m_prevterm); 289 twogram.append(1, ' '); 290 twogram += term; 291 // When emitting a complex term we set the bps to 0. This may 292 // be used by our clients 293 if (!TermProc::takeword(twogram, m_prevpos, 0, 0)) 294 return false; 295 twogramemit = true; 296 #if 0 297 if (m_stops.isStop(twogram)) { 298 firstword = twogram; 299 isstop = false; 300 } 301 #endif 302 } 303 304 m_prevterm = term; 305 m_prevstop = isstop; 306 m_prevpos = pos; 307 m_prevsent = false; 308 m_prevbs = bs; 309 m_prevbe = be; 310 // If flags allow, emit the bare term at the current pos. 311 if (!m_onlygrams || (!isstop && !twogramemit)) { 312 if (!TermProc::takeword(term, pos, bs, be)) 313 return false; 314 m_prevsent = true; 315 } 316 317 return true; 318 } 319 flush()320 virtual bool flush() { 321 if (!m_prevsent && !m_prevterm.empty()) 322 if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe)) 323 return false; 324 325 m_prevterm.clear(); 326 m_prevsent = true; 327 return TermProc::flush(); 328 } onlygrams(bool on)329 void onlygrams(bool on) { 330 m_onlygrams = on; 331 } 332 private: 333 // The stoplist we're using 334 const Rcl::StopList& m_stops; 335 // Remembered data for the last processed term 336 string m_prevterm; 337 bool m_prevstop; 338 int m_prevpos; 339 int m_prevbs; 340 int m_prevbe; 341 bool m_prevsent; 342 // If this is set, we only emit longest grams 343 bool m_onlygrams; 344 }; 345 346 347 } // End namespace Rcl 348 349 #endif /* _TERMPROC_H_INCLUDED_ */ 350 351